Do not perform XID alignment because it appears to have no noticeable impact on performance but can cause problems with restoring running xacts from CLOG

2026-01-29 00:00:38 +00:00 · 2024-07-05 18:01:33 +03:00
260 changed files with 5763 additions and 12313 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +0,0 @@
-# allows for nicer hunk headers with git show
-*.rs diff=rust
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -9,8 +9,8 @@ inputs:
    description: 'Region ID, if not set the project will be created in the default region'
    default: aws-us-east-2
  postgres_version:
-    description: 'Postgres version; default is 16'
-    default: '16'
+    description: 'Postgres version; default is 15'
+    default: '15'
  api_host:
    description: 'Neon API host'
    default: console-stage.neon.build
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -115,7 +115,6 @@ runs:
        export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
        export DEFAULT_PG_VERSION=${PG_VERSION#v}
        export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
-        export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -1,291 +0,0 @@
-name: Build and Test Locally
-
-on:
-  workflow_call:
-    inputs:
-      arch:
-        description: 'x64 or arm64'
-        required: true
-        type: string
-      build-tag:
-        description: 'build tag'
-        required: true
-        type: string
-      build-tools-image:
-        description: 'build-tools image'
-        required: true
-        type: string
-      build-type:
-        description: 'debug or release'
-        required: true
-        type: string
-
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-env:
-  RUST_BACKTRACE: 1
-  COPT: '-Werror'
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-jobs:
-  build-neon:
-    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
-    container:
-      image: ${{ inputs.build-tools-image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      # Raise locked memory limit for tokio-epoll-uring.
-      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
-      # io_uring will account the memory of the CQ and SQ as locked.
-      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
-    env:
-      BUILD_TYPE: ${{ inputs.build-type }}
-      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
-      BUILD_TAG: ${{ inputs.build-tag }}
-
-    steps:
-      - name: Fix git ownership
-        run: |
-          # Workaround for `fatal: detected dubious ownership in repository at ...`
-          #
-          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
-          #   Ref https://github.com/actions/checkout/issues/785
-          #
-          git config --global --add safe.directory ${{ github.workspace }}
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16; do
-            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
-            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
-          done
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      - name: Set pg 14 revision for caching
-        id: pg_v14_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
-
-      - name: Set pg 15 revision for caching
-        id: pg_v15_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
-
-      - name: Set pg 16 revision for caching
-        id: pg_v16_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
-
-      # Set some environment variables used by all the steps.
-      #
-      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
-      #   It also includes --features, if any
-      #
-      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
-      #   because "cargo metadata" doesn't accept --release or --debug options
-      #
-      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
-      # corresponding Cargo.toml files for their descriptions.
-      - name: Set env variables
-        run: |
-          CARGO_FEATURES="--features testing"
-          if [[ $BUILD_TYPE == "debug" ]]; then
-            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FLAGS="--locked"
-          elif [[ $BUILD_TYPE == "release" ]]; then
-            cov_prefix=""
-            CARGO_FLAGS="--locked --release"
-          fi
-          {
-            echo "cov_prefix=${cov_prefix}"
-            echo "CARGO_FEATURES=${CARGO_FEATURES}"
-            echo "CARGO_FLAGS=${CARGO_FLAGS}"
-            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
-          } >> $GITHUB_ENV
-
-      - name: Cache postgres v14 build
-        id: cache_pg_14
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Cache postgres v15 build
-        id: cache_pg_15
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Cache postgres v16 build
-        id: cache_pg_16
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
-      - name: Build postgres v14
-        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v14 -j$(nproc)
-
-      - name: Build postgres v15
-        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v15 -j$(nproc)
-
-      - name: Build postgres v16
-        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v16 -j$(nproc)
-
-      - name: Build neon extensions
-        run: mold -run make neon-pg-ext -j$(nproc)
-
-      - name: Build walproposer-lib
-        run: mold -run make walproposer-lib -j$(nproc)
-
-      - name: Run cargo build
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
-
-      # Do install *before* running rust tests because they might recompile the
-      # binaries with different features/flags.
-      - name: Install rust binaries
-        run: |
-          # Install target binaries
-          mkdir -p /tmp/neon/bin/
-          binaries=$(
-            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
-            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
-          )
-          for bin in $binaries; do
-            SRC=target/$BUILD_TYPE/$bin
-            DST=/tmp/neon/bin/$bin
-            cp "$SRC" "$DST"
-          done
-
-          # Install test executables and write list of all binaries (for code coverage)
-          if [[ $BUILD_TYPE == "debug" ]]; then
-            # Keep bloated coverage data files away from the rest of the artifact
-            mkdir -p /tmp/coverage/
-
-            mkdir -p /tmp/neon/test_bin/
-
-            test_exe_paths=$(
-              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
-              jq -r '.executable | select(. != null)'
-            )
-            for bin in $test_exe_paths; do
-              SRC=$bin
-              DST=/tmp/neon/test_bin/$(basename $bin)
-
-              # We don't need debug symbols for code coverage, so strip them out to make
-              # the artifact smaller.
-              strip "$SRC" -o "$DST"
-              echo "$DST" >> /tmp/coverage/binaries.list
-            done
-
-            for bin in $binaries; do
-              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
-            done
-          fi
-
-      - name: Run rust tests
-        env:
-          NEXTEST_RETRIES: 3
-        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
-          export LD_LIBRARY_PATH
-
-          #nextest does not yet support running doctests
-          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
-
-          for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
-          done
-
-          # Run separate tests for real S3
-          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
-          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
-
-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
-
-      - name: Install postgres binaries
-        run: cp -a pg_install /tmp/neon/pg_install
-
-      - name: Upload Neon artifact
-        uses: ./.github/actions/upload
-        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
-          path: /tmp/neon
-
-      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
-      - name: Merge and upload coverage data
-        if: inputs.build-type == 'debug'
-        uses: ./.github/actions/save-coverage-data
-
-  regress-tests:
-    # Run test on x64 only
-    if: inputs.arch == 'x64'
-    needs: [ build-neon ]
-    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
-    container:
-      image: ${{ inputs.build-tools-image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
-    strategy:
-      fail-fast: false
-      matrix:
-        pg_version: [ v14, v15, v16 ]
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: true
-          fetch-depth: 1
-
-      - name: Pytest regression tests
-        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: 60
-        with:
-          build_type: ${{ inputs.build-type }}
-          test_selection: regress
-          needs_postgres_source: true
-          run_with_real_s3: true
-          real_s3_bucket: neon-github-ci-tests
-          real_s3_region: eu-central-1
-          rerun_flaky: true
-          pg_version: ${{ matrix.pg_version }}
-        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
-          BUILD_TAG: ${{ inputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: true
-
-      # Temporary disable this step until we figure out why it's so flaky
-      # Ref https://github.com/neondatabase/neon/issues/4540
-      - name: Merge and upload coverage data
-        if: |
-          false &&
-          inputs.build-type == 'debug' && matrix.pg_version == 'v14'
-        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -56,27 +56,15 @@ concurrency:
 jobs:
  bench:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - DEFAULT_PG_VERSION: 16
-            PLATFORM: "neon-staging"
-            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
-            provisioner: 'k8s-pod' 
-          - DEFAULT_PG_VERSION: 16
-            PLATFORM: "azure-staging"
-            region_id: 'azure-eastus2'
-            provisioner: 'k8s-neonvm'
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: ${{ matrix.PLATFORM }}
+      PLATFORM: "neon-staging"

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -97,10 +85,9 @@ jobs:
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
-        region_id: ${{ matrix.region_id }}
+        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-        provisioner: ${{ matrix.provisioner }}

    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
@@ -109,18 +96,10 @@ jobs:
        test_selection: performance
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-        extra_params:
-          -m remote_cluster
-          --sparse-ordering
-          --timeout 14400
-          --ignore test_runner/performance/test_perf_olap.py
-          --ignore test_runner/performance/test_perf_pgvector_queries.py
-          --ignore test_runner/performance/test_logical_replication.py
-          --ignore test_runner/performance/test_physical_replication.py
+        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -146,71 +125,6 @@ jobs:
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

-  replication-tests:
-    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    env:
-      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
-      TEST_OUTPUT: /tmp/test_output
-      BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-staging"
-
-    runs-on: [ self-hosted, us-east-2, x64 ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
-      options: --init
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-
-    - name: Run benchmark
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_logical_replication.py
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 5400
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Run benchmark
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_physical_replication.py
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 5400
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Create Allure report
-      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report-generate
-
-    - name: Post to a Slack channel
-      if: ${{ github.event.schedule && failure() }}
-      uses: slackapi/slack-github-action@v1
-      with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
  generate-matrices:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
@@ -218,14 +132,11 @@ jobs:
    # Available platforms:
    # - neon-captest-new: Freshly created project (1 CU)
    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
-    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
-    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
-      DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
    runs-on: ubuntu-22.04
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -236,32 +147,23 @@ jobs:
    - name: Generate matrix for pgbench benchmark
      id: pgbench-compare-matrix
      run: |
-        region_id_default=${{ env.DEFAULT_REGION_ID }}
        matrix='{
-          "pg_version" : [
-            16
-          ],
-          "region_id" : [
-            "'"$region_id_default"'"
-            ],
          "platform": [
            "neon-captest-new",
            "neon-captest-reuse",
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier",         "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new",              "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -313,7 +215,7 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
      TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -338,14 +240,14 @@ jobs:
        prefix: latest

    - name: Create Neon Project
-      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
-        region_id: ${{ matrix.region_id }}
+        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
+        compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
@@ -358,7 +260,7 @@ jobs:
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
-          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -383,7 +285,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -397,7 +298,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -411,7 +311,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -438,13 +337,6 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  pgbench-pgvector:
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - PLATFORM: "neon-captest-pgvector"
-          - PLATFORM: "azure-captest-pgvector"
-            
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -452,9 +344,8 @@ jobs:
      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
-      LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: ${{ matrix.PLATFORM }}
+      PLATFORM: "neon-captest-pgvector"

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -464,39 +355,17 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
-    # instead of using Neon artifacts containing pgbench
-    - name: Install postgresql-16 where pytest expects it
-      run: |
-        cd /home/nonroot
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb 
-        dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
-        mkdir -p /tmp/neon/pg_install/v16/bin
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
-        ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib 
-        /tmp/neon/pg_install/v16/bin/pgbench --version
-        /tmp/neon/pg_install/v16/bin/psql --version
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest

    - name: Set up Connection String
      id: set-up-connstr
      run: |
-        case "${PLATFORM}" in
-          neon-captest-pgvector)
-            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
-            ;;
-          azure-captest-pgvector)
-            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
-            ;;
-          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}"
-            exit 1
-            ;;
-        esac
+        CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

@@ -508,7 +377,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -522,7 +390,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -537,10 +404,11 @@ jobs:
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

+
  clickbench-compare:
    # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
    # we use for performance testing in pgbench-compare.
@@ -784,7 +652,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -48,30 +48,12 @@ jobs:

  tag:
    needs: [ check-permissions ]
-    runs-on: ubuntu-22.04
+    runs-on: [ self-hosted, gen3, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}
-    permissions:
-      id-token: write
-      contents: read

    steps:
-      # - name: Install az cli
-      #   run: |
-      #     curl -sL https://aka.ms/InstallAzureCLIDeb --output-dir /tmp -OJ
-      #     bash /tmp/InstallAzureCLIDeb
-      #
-      - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_RUNNER_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: push
-        run: |
-          az acr login --name neoneastus2
-          docker buildx imagetools create -t neoneastus2.azurecr.io/neondatabase/neon:5718 neondatabase/neon:5718
-
      - name: Checkout
        uses: actions/checkout@v4
        with:
@@ -143,11 +125,7 @@ jobs:

  check-codestyle-rust:
    needs: [ check-permissions, build-build-tools-image ]
-    strategy:
-      matrix:
-        arch: [ x64, arm64 ]
-    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
-
+    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
      credentials:
@@ -215,27 +193,291 @@ jobs:
        if: ${{ !cancelled() }}
        run: cargo deny check --hide-inclusion-graph

-  build-and-test-locally:
-    needs: [ tag, build-build-tools-image ]
+  build-neon:
+    needs: [ check-permissions, tag, build-build-tools-image ]
+    runs-on: [ self-hosted, gen3, large ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # Raise locked memory limit for tokio-epoll-uring.
+      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
+      # io_uring will account the memory of the CQ and SQ as locked.
+      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
      fail-fast: false
      matrix:
-        arch: [ x64 ]
-        build-type: [ debug, release ]
-        include:
-          - build-type: release
-            arch: arm64
-    uses: ./.github/workflows/_build-and-test-locally.yml
-    with:
-      arch: ${{ matrix.arch }}
-      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
-      build-tag: ${{ needs.tag.outputs.build-tag }}
-      build-type: ${{ matrix.build-type }}
-    secrets: inherit
+        build_type: [ debug, release ]
+    env:
+      BUILD_TYPE: ${{ matrix.build_type }}
+      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      # Set some environment variables used by all the steps.
+      #
+      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
+      #   It also includes --features, if any
+      #
+      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
+      #   because "cargo metadata" doesn't accept --release or --debug options
+      #
+      # We run tests with additional features, that are turned off by default (e.g. in release builds), see
+      # corresponding Cargo.toml files for their descriptions.
+      - name: Set env variables
+        run: |
+          CARGO_FEATURES="--features testing"
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
+            CARGO_FLAGS="--locked"
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=""
+            CARGO_FLAGS="--locked --release"
+          fi
+          {
+            echo "cov_prefix=${cov_prefix}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
+            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
+          } >> $GITHUB_ENV
+
+      # Disabled for now
+      # Don't include the ~/.cargo/registry/src directory. It contains just
+      # uncompressed versions of the crates in ~/.cargo/registry/cache
+      # directory, and it's faster to let 'cargo' rebuild it from the
+      # compressed crates.
+#      - name: Cache cargo deps
+#        id: cache_cargo
+#        uses: actions/cache@v4
+#        with:
+#          path: |
+#            ~/.cargo/registry/
+#            !~/.cargo/registry/src
+#            ~/.cargo/git/
+#            target/
+#          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
+#          key: |
+#            v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+#            v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      # Do install *before* running rust tests because they might recompile the
+      # binaries with different features/flags.
+      - name: Install rust binaries
+        run: |
+          # Install target binaries
+          mkdir -p /tmp/neon/bin/
+          binaries=$(
+            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
+            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+          )
+          for bin in $binaries; do
+            SRC=target/$BUILD_TYPE/$bin
+            DST=/tmp/neon/bin/$bin
+            cp "$SRC" "$DST"
+          done
+
+          # Install test executables and write list of all binaries (for code coverage)
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            # Keep bloated coverage data files away from the rest of the artifact
+            mkdir -p /tmp/coverage/
+
+            mkdir -p /tmp/neon/test_bin/
+
+            test_exe_paths=$(
+              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
+              jq -r '.executable | select(. != null)'
+            )
+            for bin in $test_exe_paths; do
+              SRC=$bin
+              DST=/tmp/neon/test_bin/$(basename $bin)
+
+              # We don't need debug symbols for code coverage, so strip them out to make
+              # the artifact smaller.
+              strip "$SRC" -o "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
+            done
+
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
+          fi
+
+      - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
+          #nextest does not yet support running doctests
+          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
+
+      - name: Install postgres binaries
+        run: cp -a pg_install /tmp/neon/pg_install
+
+      - name: Upload Neon artifact
+        uses: ./.github/actions/upload
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
+          path: /tmp/neon
+
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  regress-tests:
+    needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
+    runs-on: [ self-hosted, gen3, large ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        pg_version: [ v14, v15, v16 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Pytest regression tests
+        uses: ./.github/actions/run-python-test-set
+        timeout-minutes: 60
+        with:
+          build_type: ${{ matrix.build_type }}
+          test_selection: regress
+          needs_postgres_source: true
+          run_with_real_s3: true
+          real_s3_bucket: neon-github-ci-tests
+          real_s3_region: eu-central-1
+          rerun_flaky: true
+          pg_version: ${{ matrix.pg_version }}
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_IMPL: vectored
+          PAGESERVER_GET_IMPL: vectored
+          PAGESERVER_VALIDATE_VEC_GET: true
+
+      # Temporarily disable this step until we figure out why it's so flaky
+      # Ref https://github.com/neondatabase/neon/issues/4540
+      - name: Merge and upload coverage data
+        if: |
+          false &&
+          matrix.build_type == 'debug' && matrix.pg_version == 'v14'
+        uses: ./.github/actions/save-coverage-data

-  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
  get-benchmarks-durations:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    outputs:
      json: ${{ steps.get-benchmark-durations.outputs.json }}
    needs: [ check-permissions, build-build-tools-image ]
@@ -246,6 +488,7 @@ jobs:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -270,8 +513,7 @@ jobs:
          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT

  benchmarks:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
-    needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -280,6 +522,7 @@ jobs:
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # for changed limits, see comments on `options:` earlier in this file
      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
      matrix:
@@ -327,7 +570,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -378,7 +621,7 @@ jobs:
            })

  coverage-report:
-    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
+    needs: [ check-permissions, regress-tests, build-build-tools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -529,8 +772,7 @@ jobs:
          pull: true
          file: Dockerfile
          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
-          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
-          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
          tags: |
            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -623,8 +865,7 @@ jobs:
          pull: true
          file: Dockerfile.compute-node
          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
-          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
-          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          tags: |
            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -644,8 +885,7 @@ jobs:
          file: Dockerfile.compute-node
          target: neon-pg-ext-test
          cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
-          # 23.07.2024 temporarily disable cache saving in the registry as it is very slow
-          # cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}

@@ -852,12 +1092,6 @@ jobs:
      VERSIONS: v14 v15 v16

    steps:
-      - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_RUNNER_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -989,7 +1223,7 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, gen3, small ]
@@ -1090,7 +1324,7 @@ jobs:
            })

  promote-compatibility-data:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
+    needs: [ check-permissions, promote-images, tag, regress-tests ]
    if: github.ref_name == 'release'

    runs-on: [ self-hosted, gen3, small ]
@@ -1102,7 +1336,6 @@ jobs:
        env:
          BUCKET: neon-github-public-dev
          PREFIX: artifacts/latest
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          # Update compatibility snapshot for the release
          for pg_version in v14 v15 v16; do
@@ -1116,7 +1349,7 @@ jobs:

          # Update Neon artifact for the release (reuse already uploaded artifact)
          for build_type in debug release; do
-            OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
+            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst

            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
@@ -1129,7 +1362,7 @@ jobs:
          done

  pin-build-tools-image:
-    needs: [ build-build-tools-image, promote-images, build-and-test-locally ]
+    needs: [ build-build-tools-image, promote-images, regress-tests ]
    if: github.ref_name == 'main'
    uses: ./.github/workflows/pin-build-tools-image.yml
    with:
@@ -1151,7 +1384,7 @@ jobs:
    needs:
      - check-codestyle-python
      - check-codestyle-rust
-      - build-and-test-locally
+      - regress-tests
      - test-images
    runs-on: ubuntu-22.04
    steps:
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -133,6 +133,221 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    needs: [ check-permissions, build-build-tools-image ]
+    timeout-minutes: 90
+    runs-on: [ self-hosted, small-arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
+
+      - name: Run cargo test
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
+          cargo nextest run $CARGO_FEATURES -j$(nproc)
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
+
+  check-codestyle-rust-arm:
+    needs: [ check-permissions, build-build-tools-image ]
+    timeout-minutes: 90
+    runs-on: [ self-hosted, small-arm64 ]
+
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+
+      - name: Run cargo clippy (debug)
+        if: matrix.build_type == 'debug'
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        if: matrix.build_type == 'release'
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        if: matrix.build_type == 'release'
+        run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
+        run: cargo deny check
+
  gather-rust-build-stats:
    needs: [ check-permissions, build-build-tools-image ]
    if: |
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -261,6 +261,15 @@ version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"

+[[package]]
+name = "atomic-polyfill"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289"
+dependencies = [
+ "critical-section",
+]
+
 [[package]]
 name = "atomic-take"
 version = "1.1.0"
@@ -1227,7 +1236,6 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
- "rlimit",
 "rust-ini",
 "serde",
 "serde_json",
@@ -1359,7 +1367,6 @@ dependencies = [
 "tracing",
 "url",
 "utils",
- "whoami",
 "workspace_hack",
 ]

@@ -1390,9 +1397,9 @@ dependencies = [

 [[package]]
 name = "crc32c"
-version = "0.6.8"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
+checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
 dependencies = [
 "rustc_version",
 ]
@@ -1442,6 +1449,12 @@ dependencies = [
 "itertools",
 ]

+[[package]]
+name = "critical-section"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
+
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.8"
@@ -1638,16 +1651,6 @@ dependencies = [
 "rusticata-macros",
 ]

-[[package]]
-name = "deranged"
-version = "0.3.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
-dependencies = [
- "powerfmt",
- "serde",
-]
-
 [[package]]
 name = "desim"
 version = "0.1.0"
@@ -2014,6 +2017,16 @@ dependencies = [
 "tokio-util",
 ]

+[[package]]
+name = "fs2"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"
@@ -2267,6 +2280,15 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "hash32"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -2315,6 +2337,18 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "heapless"
+version = "0.8.0"
+source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001"
+dependencies = [
+ "atomic-polyfill",
+ "hash32",
+ "rustc_version",
+ "spin 0.9.8",
+ "stable_deref_trait",
+]
+
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -2348,6 +2382,16 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"

+[[package]]
+name = "histogram"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b"
+dependencies = [
+ "serde",
+ "thiserror",
+]
+
 [[package]]
 name = "hmac"
 version = "0.12.1"
@@ -2964,9 +3008,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
+checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
 "bytes",
 "crossbeam-utils",
@@ -2982,9 +3026,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
+checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",
@@ -2994,9 +3038,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
+checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
 dependencies = [
 "libc",
 "measured",
@@ -3188,6 +3232,16 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -3221,12 +3275,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "num-conv"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -3483,6 +3531,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -3613,7 +3667,6 @@ dependencies = [
 "sysinfo",
 "tenant_size_model",
 "thiserror",
- "tikv-jemallocator",
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
@@ -4024,7 +4077,6 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
- "tokio-util",
 "tracing",
 "workspace_hack",
 ]
@@ -4065,12 +4117,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "powerfmt"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
-
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -4343,7 +4389,6 @@ dependencies = [
 "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
- "typed-json",
 "url",
 "urlencoding",
 "utils",
@@ -4542,15 +4587,6 @@ dependencies = [
 "bitflags 1.3.2",
 ]

-[[package]]
-name = "redox_syscall"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
-dependencies = [
- "bitflags 1.3.2",
-]
-
 [[package]]
 name = "regex"
 version = "1.10.2"
@@ -4612,7 +4648,6 @@ name = "remote_storage"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-stream",
 "async-trait",
 "aws-config",
 "aws-credential-types",
@@ -4842,15 +4877,6 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

-[[package]]
-name = "rlimit"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "routerify"
 version = "3.0.0"
@@ -5119,6 +5145,7 @@ dependencies = [
 "crc32c",
 "desim",
 "fail",
+ "fs2",
 "futures",
 "git-version",
 "hex",
@@ -5145,8 +5172,6 @@ dependencies = [
 "sha2",
 "signal-hook",
 "storage_broker",
- "strum",
- "strum_macros",
 "thiserror",
 "tokio",
 "tokio-io-timeout",
@@ -5371,9 +5396,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

 [[package]]
 name = "serde"
-version = "1.0.203"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
+checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
 dependencies = [
 "serde_derive",
 ]
@@ -5390,9 +5415,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.203"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
+checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -5655,6 +5680,9 @@ name = "spin"
 version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+dependencies = [
+ "lock_api",
+]

 [[package]]
 name = "spki"
@@ -5676,6 +5704,12 @@ dependencies = [
 "der 0.7.8",
 ]

+[[package]]
+name = "stable_deref_trait"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -5752,28 +5786,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "storage_controller_client"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-trait",
- "bytes",
- "futures",
- "pageserver_api",
- "pageserver_client",
- "postgres",
- "reqwest 0.12.4",
- "serde",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-stream",
- "tokio-util",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "storage_scrubber"
 version = "0.1.0"
@@ -5793,6 +5805,7 @@ dependencies = [
 "futures",
 "futures-util",
 "hex",
+ "histogram",
 "humantime",
 "itertools",
 "once_cell",
@@ -5807,7 +5820,6 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_with",
- "storage_controller_client",
 "thiserror",
 "tokio",
 "tokio-postgres",
@@ -5837,7 +5849,6 @@ dependencies = [
 "reqwest 0.12.4",
 "serde",
 "serde_json",
- "storage_controller_client",
 "thiserror",
 "tokio",
 "tracing",
@@ -6096,15 +6107,12 @@ dependencies = [

 [[package]]
 name = "time"
-version = "0.3.36"
+version = "0.3.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
 dependencies = [
- "deranged",
 "itoa",
 "js-sys",
- "num-conv",
- "powerfmt",
 "serde",
 "time-core",
 "time-macros",
@@ -6112,17 +6120,16 @@ dependencies = [

 [[package]]
 name = "time-core"
-version = "0.1.2"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
+checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"

 [[package]]
 name = "time-macros"
-version = "0.2.18"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
 dependencies = [
- "num-conv",
 "time-core",
 ]

@@ -6465,6 +6472,17 @@ version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"

+[[package]]
+name = "trace"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "pageserver_api",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "tracing"
 version = "0.1.37"
@@ -6564,6 +6582,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -6628,16 +6647,6 @@ dependencies = [
 "static_assertions",
 ]

-[[package]]
-name = "typed-json"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
-dependencies = [
- "serde",
- "serde_json",
-]
-
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -6772,6 +6781,7 @@ dependencies = [
 "criterion",
 "fail",
 "futures",
+ "heapless",
 "hex",
 "hex-literal",
 "humantime",
@@ -6933,12 +6943,6 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

-[[package]]
-name = "wasite"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
-
 [[package]]
 name = "wasm-bindgen"
 version = "0.2.92"
@@ -7091,17 +7095,6 @@ dependencies = [
 "once_cell",
 ]

-[[package]]
-name = "whoami"
-version = "1.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
-dependencies = [
- "redox_syscall 0.4.1",
- "wasite",
- "web-sys",
-]
-
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -7434,12 +7427,13 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
- "deranged",
 "either",
 "fail",
 "futures-channel",
+ "futures-core",
 "futures-executor",
 "futures-io",
+ "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
 "hashbrown 0.14.5",
@@ -7457,9 +7451,7 @@ dependencies = [
 "num-traits",
 "once_cell",
 "parquet",
- "proc-macro2",
 "prost",
- "quote",
 "rand 0.8.5",
 "regex",
 "regex-automata 0.4.3",
@@ -7476,7 +7468,6 @@ dependencies = [
 "syn 1.0.109",
 "syn 2.0.52",
 "sync_wrapper",
- "tikv-jemalloc-sys",
 "time",
 "time-macros",
 "tokio",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,9 +13,9 @@ members = [
    "safekeeper",
    "storage_broker",
    "storage_controller",
-    "storage_controller/client",
    "storage_scrubber",
    "workspace_hack",
+    "trace",
    "libs/compute_api",
    "libs/pageserver_api",
    "libs/postgres_ffi",
@@ -84,6 +84,7 @@ enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
 framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
+fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
@@ -110,8 +111,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.22", features=["lasso"] }
-measured-process = { version = "0.0.22" }
+measured = { version = "0.0.21", features=["lasso"] }
+measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
@@ -183,16 +184,14 @@ tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
 twox-hash = { version = "1.6.3", default-features = false }
-typed-json = "0.1"
 url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 rustls-native-certs = "0.7"
 x509-parser = "0.15"
-whoami = "1.5.1"

 ## TODO replace this with tracing
 env_logger = "0.10"
@@ -204,6 +203,9 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

+## Other git libraries
+heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
+
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
@@ -219,7 +221,6 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
 safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
 desim = { version = "0.1", path = "./libs/desim" }
 storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
-storage_controller_client = { path = "./storage_controller/client" }
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
--- a/18
+++ b/18
@@ -93,14 +93,13 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
 # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
-RUN mkdir -p /data/.neon/ && \
-  echo "id=1234" > "/data/.neon/identity.toml" && \
-  echo "broker_endpoint='http://storage_broker:50051'\n" \
-       "pg_distrib_dir='/usr/local/'\n" \
-       "listen_pg_addr='0.0.0.0:6400'\n" \
-       "listen_http_addr='0.0.0.0:9898'\n" \
-  > /data/.neon/pageserver.toml && \
-  chown -R neon:neon /data/.neon
+RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
+    && /usr/local/bin/pageserver -D /data/.neon/ --init \
+       -c "id=1234" \
+       -c "broker_endpoint='http://storage_broker:50051'" \
+       -c "pg_distrib_dir='/usr/local/'" \
+       -c "listen_pg_addr='0.0.0.0:6400'" \
+       -c "listen_http_addr='0.0.0.0:9898'"

 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
@@ -111,6 +110,3 @@ VOLUME ["/data"]
 USER neon
 EXPOSE 6400
 EXPOSE 9898
-
-CMD /usr/local/bin/pageserver -D /data/.neon
-
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -311,12 +311,9 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
 FROM build-deps AS rum-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-COPY patches/rum.patch /rum.patch
-
 RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
-    patch -p1 < /rum.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
--- a/13
+++ b/13
@@ -69,8 +69,6 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
 # Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
 CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib

-CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
-
 #
 # Top level Makefile to build Neon and PostgreSQL
 #
@@ -81,24 +79,15 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers walproposer-lib cargo-target-dir
+neon: postgres-headers walproposer-lib
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
-.PHONY: cargo-target-dir
-cargo-target-dir:
-	# https://github.com/rust-lang/cargo/issues/14281
-	mkdir -p target
-	test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

 ### PostgreSQL parts
 # Some rules are duplicated for Postgres v14 and 15. We may want to refactor
 # to avoid the duplication in the future, but it's tolerable for now.
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
-
-	mkdir -p $(POSTGRES_INSTALL_DIR)
-	test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG
-
 	+@echo "Configuring Postgres $* build"
 	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
 		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -44,4 +44,3 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
 bytes = "1.0"
 rust-ini = "0.20.0"
-rlimit = "0.10.1"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -6,7 +6,7 @@
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
 //! - If remote_extension_config is provided, it will be used to fetch extensions list
-//!   and download `shared_preload_libraries` from the remote storage.
+//!  and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -33,6 +33,7 @@
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
+//!
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
@@ -63,7 +64,6 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
 use compute_tools::swap::resize_swap;
-use rlimit::{setrlimit, Resource};

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
@@ -72,9 +72,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
    let (build_tag, clap_args) = init()?;

-    // enable core dumping for all child processes
-    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
-
    let (pg_handle, start_pg_result) = {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -56,7 +56,6 @@ pub struct ComputeNode {
    /// - we push new spec and it does reconfiguration
    /// - but then something happens and compute pod / VM is destroyed,
    ///   so k8s controller starts it again with the **old** spec
-    ///
    /// and the same for empty computes:
    /// - we started compute without any spec
    /// - we push spec and it does configuration
@@ -799,11 +798,7 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let mut connstr = self.connstr.clone();
-        connstr
-            .query_pairs_mut()
-            .append_pair("application_name", "apply_config");
-
+        let connstr = self.connstr.clone();
        let mut client = match Client::connect(connstr.as_str(), NoTls) {
            Err(e) => match e.code() {
                Some(&SqlState::INVALID_PASSWORD)
@@ -872,11 +867,6 @@ impl ComputeNode {

        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
-            let mut connstr = connstr.clone();
-            connstr
-                .query_pairs_mut()
-                .append_pair("application_name", "migrations");
-
            let mut client = Client::connect(connstr.as_str(), NoTls)?;
            handle_migrations(&mut client).context("apply_config handle_migrations")
        });
@@ -1117,7 +1107,7 @@ impl ComputeNode {
    // EKS worker nodes have following core dump settings:
    //   /proc/sys/kernel/core_pattern -> core
    //   /proc/sys/kernel/core_uses_pid -> 1
-    //   ulimit -c -> unlimited
+    //   ulimit -c -> unlimited
    // which results in core dumps being written to postgres data directory as core.<pid>.
    //
    // Use that as a default location and pattern, except macos where core dumps are written
@@ -1396,9 +1386,7 @@ pub fn forward_termination_signal() {
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
-        // ROs to get a list of running xacts faster instead of going through the CLOG.
-        // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
-        kill(pg_pid, Signal::SIGINT).ok();
+        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
+        kill(pg_pid, Signal::SIGQUIT).ok();
    }
 }
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -11,7 +11,6 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
-mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -1,105 +0,0 @@
-use anyhow::{Context, Result};
-use postgres::Client;
-use tracing::info;
-
-pub(crate) struct MigrationRunner<'m> {
-    client: &'m mut Client,
-    migrations: &'m [&'m str],
-}
-
-impl<'m> MigrationRunner<'m> {
-    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
-        // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
-        assert!(migrations.len() + 1 < i64::MAX as usize);
-
-        Self { client, migrations }
-    }
-
-    fn get_migration_id(&mut self) -> Result<i64> {
-        let query = "SELECT id FROM neon_migration.migration_id";
-        let row = self
-            .client
-            .query_one(query, &[])
-            .context("run_migrations get migration_id")?;
-
-        Ok(row.get::<&str, i64>("id"))
-    }
-
-    fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
-        let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
-
-        self.client
-            .simple_query(&setval)
-            .context("run_migrations update id")?;
-
-        Ok(())
-    }
-
-    fn prepare_migrations(&mut self) -> Result<()> {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        self.client.simple_query(query)?;
-
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        self.client.simple_query(query)?;
-
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        self.client.simple_query(query)?;
-
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        self.client.simple_query(query)?;
-
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        self.client.simple_query(query)?;
-
-        Ok(())
-    }
-
-    pub fn run_migrations(mut self) -> Result<()> {
-        self.prepare_migrations()?;
-
-        let mut current_migration = self.get_migration_id()? as usize;
-        while current_migration < self.migrations.len() {
-            macro_rules! migration_id {
-                ($cm:expr) => {
-                    ($cm + 1) as i64
-                };
-            }
-
-            let migration = self.migrations[current_migration];
-
-            if migration.starts_with("-- SKIP") {
-                info!("Skipping migration id={}", migration_id!(current_migration));
-            } else {
-                info!(
-                    "Running migration id={}:\n{}\n",
-                    migration_id!(current_migration),
-                    migration
-                );
-
-                self.client
-                    .simple_query("BEGIN")
-                    .context("begin migration")?;
-
-                self.client.simple_query(migration).with_context(|| {
-                    format!(
-                        "run_migrations migration id={}",
-                        migration_id!(current_migration)
-                    )
-                })?;
-
-                // Migration IDs start at 1
-                self.update_migration_id(migration_id!(current_migration))?;
-
-                self.client
-                    .simple_query("COMMIT")
-                    .context("commit migration")?;
-
-                info!("Finished migration id={}", migration_id!(current_migration));
-            }
-
-            current_migration += 1;
-        }
-
-        Ok(())
-    }
-}
--- a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
+++ b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
--- a/compute_tools/src/migrations/0001-alter_roles.sql
+++ b/compute_tools/src/migrations/0001-alter_roles.sql
--- a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
--- a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
--- a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
+++ b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
--- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
@@ -1,7 +0,0 @@
-DO $$
-BEGIN
-    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
-       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
-       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
-    END IF;
-END $$;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -10,7 +10,6 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::logger::inlinify;
-use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -777,25 +776,84 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {

    // Add new migrations in numerical order.
    let migrations = [
-        include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
-        include_str!("./migrations/0002-alter_roles.sql"),
-        include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
-        include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
-        include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
-        include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
+        include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
+        include_str!("./migrations/0001-alter_roles.sql"),
+        include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
+        include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
+        include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
+        include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
        include_str!(
-            "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
+            "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
        ),
        include_str!(
-            "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
-        ),
-        include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
-        include_str!(
-            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
+            "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
        ),
+        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
    ];

-    MigrationRunner::new(client, &migrations).run_migrations()?;
+    let mut func = || {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        client.simple_query(query)?;
+        Ok::<_, anyhow::Error>(())
+    };
+    func().context("handle_migrations prepare")?;
+
+    let query = "SELECT id FROM neon_migration.migration_id";
+    let row = client
+        .query_one(query, &[])
+        .context("handle_migrations get migration_id")?;
+    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
+    let starting_migration_id = current_migration;
+
+    let query = "BEGIN";
+    client
+        .simple_query(query)
+        .context("handle_migrations begin")?;
+
+    while current_migration < migrations.len() {
+        let migration = &migrations[current_migration];
+        if migration.starts_with("-- SKIP") {
+            info!("Skipping migration id={}", current_migration);
+        } else {
+            info!(
+                "Running migration id={}:\n{}\n",
+                current_migration, migration
+            );
+            client.simple_query(migration).with_context(|| {
+                format!("handle_migrations current_migration={}", current_migration)
+            })?;
+        }
+        current_migration += 1;
+    }
+    let setval = format!(
+        "UPDATE neon_migration.migration_id SET id={}",
+        migrations.len()
+    );
+    client
+        .simple_query(&setval)
+        .context("handle_migrations update id")?;
+
+    let query = "COMMIT";
+    client
+        .simple_query(query)
+        .context("handle_migrations commit")?;
+
+    info!(
+        "Ran {} migrations",
+        (migrations.len() - starting_migration_id)
+    );

    Ok(())
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -40,7 +40,6 @@ safekeeper_api.workspace = true
 postgres_connection.workspace = true
 storage_broker.workspace = true
 utils.workspace = true
-whoami.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -21,9 +21,7 @@ use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
-use pageserver_api::controller_api::{
-    NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest,
-};
+use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest};
 use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
@@ -1252,70 +1250,9 @@ async fn handle_start_all(
            exit(1);
        }
    }
-
-    neon_start_status_check(env, retry_timeout).await?;
-
    Ok(())
 }

-async fn neon_start_status_check(
-    env: &local_env::LocalEnv,
-    retry_timeout: &Duration,
-) -> anyhow::Result<()> {
-    const RETRY_INTERVAL: Duration = Duration::from_millis(100);
-    const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5);
-
-    if env.control_plane_api.is_none() {
-        return Ok(());
-    }
-
-    let storcon = StorageController::from_env(env);
-
-    let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
-    let notice_after_retries = retry_timeout.as_millis() / NOTICE_AFTER_RETRIES.as_millis();
-
-    println!("\nRunning neon status check");
-
-    for retry in 0..retries {
-        if retry == notice_after_retries {
-            println!("\nNeon status check has not passed yet, continuing to wait")
-        }
-
-        let mut passed = true;
-        let mut nodes = storcon.node_list().await?;
-        let mut pageservers = env.pageservers.clone();
-
-        if nodes.len() != pageservers.len() {
-            continue;
-        }
-
-        nodes.sort_by_key(|ps| ps.id);
-        pageservers.sort_by_key(|ps| ps.id);
-
-        for (idx, pageserver) in pageservers.iter().enumerate() {
-            let node = &nodes[idx];
-            if node.id != pageserver.id {
-                passed = false;
-                break;
-            }
-
-            if !matches!(node.availability, NodeAvailabilityWrapper::Active) {
-                passed = false;
-                break;
-            }
-        }
-
-        if passed {
-            println!("\nNeon started and passed status check");
-            return Ok(());
-        }
-
-        tokio::time::sleep(RETRY_INTERVAL).await;
-    }
-
-    anyhow::bail!("\nNeon passed status check")
-}
-
 async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let immediate =
        sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -1,9 +1,9 @@
 //! Code to manage the storage broker
 //!
-//! In the local test environment, the storage broker stores its data directly in
+//! In the local test environment, the data for each safekeeper is stored in
 //!
 //! ```text
-//!   .neon
+//!   .neon/safekeepers/<safekeeper id>
 //! ```
 use std::time::Duration;

--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -151,10 +151,7 @@ pub struct NeonBroker {
 pub struct NeonStorageControllerConf {
    /// Heartbeat timeout before marking a node offline
    #[serde(with = "humantime_serde")]
-    pub max_offline: Duration,
-
-    #[serde(with = "humantime_serde")]
-    pub max_warming_up: Duration,
+    pub max_unavailable: Duration,

    /// Threshold for auto-splitting a tenant into shards
    pub split_threshold: Option<u64>,
@@ -162,16 +159,14 @@ pub struct NeonStorageControllerConf {

 impl NeonStorageControllerConf {
    // Use a shorter pageserver unavailability interval than the default to speed up tests.
-    const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
-
-    const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
+    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
+        std::time::Duration::from_secs(10);
 }

 impl Default for NeonStorageControllerConf {
    fn default() -> Self {
        Self {
-            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
-            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
+            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
            split_threshold: None,
        }
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,10 +1,8 @@
 //! Code to manage pageservers
 //!
-//! In the local test environment, the data for each pageserver is stored in
+//! In the local test environment, the pageserver stores its data directly in
 //!
-//! ```text
-//!   .neon/pageserver_<pageserver_id>
-//! ```
+//!   .neon/
 //!
 use std::collections::HashMap;

@@ -17,6 +15,7 @@ use std::time::Duration;

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
+use futures::SinkExt;
 use pageserver_api::models::{
    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -25,7 +24,6 @@ use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use utils::auth::{Claims, Scope};
-use utils::id::NodeId;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -75,10 +73,6 @@ impl PageServerNode {
        }
    }

-    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
-        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
-    }
-
    fn pageserver_init_make_toml(
        &self,
        conf: NeonLocalInitPageserverConf,
@@ -191,19 +185,6 @@ impl PageServerNode {
            .write_all(config.to_string().as_bytes())
            .context("write pageserver toml")?;
        drop(config_file);
-
-        let identity_file_path = datadir.join("identity.toml");
-        let mut identity_file = std::fs::OpenOptions::new()
-            .create_new(true)
-            .write(true)
-            .open(identity_file_path)
-            .with_context(|| format!("open identity toml for write: {config_file_path:?}"))?;
-        let identity_toml = self.pageserver_make_identity_toml(node_id);
-        identity_file
-            .write_all(identity_toml.to_string().as_bytes())
-            .context("write identity toml")?;
-        drop(identity_toml);
-
        // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config

        // Write metadata file, used by pageserver on startup to register itself with
@@ -369,6 +350,11 @@ impl PageServerNode {
                .map(|x| x.parse::<NonZeroU64>())
                .transpose()
                .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+            trace_read_requests: settings
+                .remove("trace_read_requests")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'trace_read_requests' as bool")?,
            eviction_policy: settings
                .remove("eviction_policy")
                .map(serde_json::from_str)
@@ -469,6 +455,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<NonZeroU64>())
                    .transpose()
                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+                trace_read_requests: settings
+                    .remove("trace_read_requests")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'trace_read_requests' as bool")?,
                eviction_policy: settings
                    .remove("eviction_policy")
                    .map(serde_json::from_str)
@@ -575,39 +566,60 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
+        let (client, conn) = self.page_server_psql_client().await?;
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own.
+        tokio::spawn(async move {
+            if let Err(e) = conn.await {
+                eprintln!("connection error: {}", e);
+            }
+        });
+        let client = std::pin::pin!(client);
+
        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
        let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
-        let base_tarfile =
-            mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
+        let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);

        // Init wal reader if necessary
        let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
            let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
-            let wal_reader =
-                mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
+            let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
            (end_lsn, Some(wal_reader))
        } else {
            (start_lsn, None)
        };

-        // Import base
-        self.http_client
-            .import_basebackup(
-                tenant_id,
-                timeline_id,
-                start_lsn,
-                end_lsn,
-                pg_version,
-                base_tarfile,
-            )
-            .await?;
+        let copy_in = |reader, cmd| {
+            let client = &client;
+            async move {
+                let writer = client.copy_in(&cmd).await?;
+                let writer = std::pin::pin!(writer);
+                let mut writer = writer.sink_map_err(|e| {
+                    std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
+                });
+                let mut reader = std::pin::pin!(reader);
+                writer.send_all(&mut reader).await?;
+                writer.into_inner().finish().await?;
+                anyhow::Ok(())
+            }
+        };

+        // Import base
+        copy_in(
+            base_tarfile,
+            format!(
+                "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
+            ),
+        )
+        .await?;
        // Import wal if necessary
        if let Some(wal_reader) = wal_reader {
-            self.http_client
-                .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
-                .await?;
+            copy_in(
+                wal_reader,
+                format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
+            )
+            .await?;
        }

        Ok(())
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -5,9 +5,8 @@ use crate::{
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::{
    controller_api::{
-        NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
-        TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
-        TenantShardMigrateResponse,
+        NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse,
+        TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
    },
    models::{
        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
@@ -30,6 +29,7 @@ use utils::{
 pub struct StorageController {
    env: LocalEnv,
    listen: String,
+    path: Utf8PathBuf,
    private_key: Option<Vec<u8>>,
    public_key: Option<String>,
    postgres_port: u16,
@@ -41,8 +41,6 @@ const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

-const DB_NAME: &str = "storage_controller";
-
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -67,6 +65,10 @@ pub struct InspectResponse {

 impl StorageController {
    pub fn from_env(env: &LocalEnv) -> Self {
+        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
+            .unwrap()
+            .join("attachments.json");
+
        // Makes no sense to construct this if pageservers aren't going to use it: assume
        // pageservers have control plane API set
        let listen_url = env.control_plane_api.clone().unwrap();
@@ -126,6 +128,7 @@ impl StorageController {

        Self {
            env: env.clone(),
+            path,
            listen,
            private_key,
            public_key,
@@ -200,6 +203,7 @@ impl StorageController {
    ///
    /// Returns the database url
    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        const DB_NAME: &str = "storage_controller";
        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);

        let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -228,30 +232,6 @@ impl StorageController {
        Ok(database_url)
    }

-    pub async fn connect_to_database(
-        &self,
-    ) -> anyhow::Result<(
-        tokio_postgres::Client,
-        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
-    )> {
-        tokio_postgres::Config::new()
-            .host("localhost")
-            .port(self.postgres_port)
-            // The user is the ambient operating system user name.
-            // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
-            //
-            // Until we get there, use the ambient operating system user name.
-            // Recent tokio-postgres versions default to this if the user isn't specified.
-            // But tokio-postgres fork doesn't have this upstream commit:
-            // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
-            // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
-            .user(&whoami::username())
-            .dbname(DB_NAME)
-            .connect(tokio_postgres::NoTls)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
@@ -276,20 +256,17 @@ impl StorageController {
            if !status.success() {
                anyhow::bail!("initdb failed with status {status}");
            }
-        };

-        // Write a minimal config file:
-        // - Specify the port, since this is chosen dynamically
-        // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-        //   the storage controller we don't want a slow local disk to interfere with that.
-        //
-        // NB: it's important that we rewrite this file on each start command so we propagate changes
-        // from `LocalEnv`'s config file (`.neon/config`).
-        tokio::fs::write(
-            &pg_data_path.join("postgresql.conf"),
-            format!("port = {}\nfsync=off\n", self.postgres_port),
-        )
-        .await?;
+            // Write a minimal config file:
+            // - Specify the port, since this is chosen dynamically
+            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+            //   the storage controller we don't want a slow local disk to interfere with that.
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}\nfsync=off\n", self.postgres_port),
+            )
+            .await?;
+        };

        println!("Starting storage controller database...");
        let db_start_args = [
@@ -319,45 +296,16 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

-        // We support running a startup SQL script to fiddle with the database before we launch storcon.
-        // This is used by the test suite.
-        let startup_script_path = self
-            .env
-            .base_data_dir
-            .join("storage_controller_db.startup.sql");
-        let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
-            Ok(script) => {
-                tokio::fs::remove_file(startup_script_path).await?;
-                script
-            }
-            Err(e) => {
-                if e.kind() == std::io::ErrorKind::NotFound {
-                    // always run some startup script so that this code path doesn't bit rot
-                    "BEGIN; COMMIT;".to_string()
-                } else {
-                    anyhow::bail!("Failed to read startup script: {e}")
-                }
-            }
-        };
-        let (mut client, conn) = self.connect_to_database().await?;
-        let conn = tokio::spawn(conn);
-        let tx = client.build_transaction();
-        let tx = tx.start().await?;
-        tx.batch_execute(&startup_script).await?;
-        tx.commit().await?;
-        drop(client);
-        conn.await??;
-
        let mut args = vec![
            "-l",
            &self.listen,
+            "-p",
+            self.path.as_ref(),
            "--dev",
            "--database-url",
            &database_url,
-            "--max-offline-interval",
-            &humantime::Duration::from(self.config.max_offline).to_string(),
-            "--max-warming-up-interval",
-            &humantime::Duration::from(self.config.max_warming_up).to_string(),
+            "--max-unavailable-interval",
+            &humantime::Duration::from(self.config.max_unavailable).to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -628,15 +576,6 @@ impl StorageController {
        .await
    }

-    pub async fn node_list(&self) -> anyhow::Result<Vec<NodeDescribeResponse>> {
-        self.dispatch::<(), Vec<NodeDescribeResponse>>(
-            Method::GET,
-            "control/v1/node".to_string(),
-            None,
-        )
-        .await
-    }
-
    #[instrument(skip(self))]
    pub async fn ready(&self) -> anyhow::Result<()> {
        self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -17,7 +17,6 @@ pageserver_client.workspace = true
 reqwest.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
-storage_controller_client.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -14,15 +14,15 @@ use pageserver_api::{
    },
    shard::{ShardStripeSize, TenantShardId},
 };
-use pageserver_client::mgmt_api::{self};
+use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
 use reqwest::{Method, StatusCode, Url};
+use serde::{de::DeserializeOwned, Serialize};
 use utils::id::{NodeId, TenantId};

 use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
    TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
-use storage_controller_client::control_api::Client;

 #[derive(Subcommand, Debug)]
 enum Command {
@@ -56,10 +56,6 @@ enum Command {
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
-    NodeDelete {
-        #[arg(long)]
-        node_id: NodeId,
-    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
@@ -249,6 +245,64 @@ impl FromStr for NodeAvailabilityArg {
    }
 }

+struct Client {
+    base_url: Url,
+    jwt_token: Option<String>,
+    client: reqwest::Client,
+}
+
+impl Client {
+    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
+        Self {
+            base_url,
+            jwt_token,
+            client: reqwest::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
+        }
+    }
+
+    /// Simple HTTP request wrapper for calling into storage controller
+    async fn dispatch<RQ, RS>(
+        &self,
+        method: Method,
+        path: String,
+        body: Option<RQ>,
+    ) -> mgmt_api::Result<RS>
+    where
+        RQ: Serialize + Sized,
+        RS: DeserializeOwned + Sized,
+    {
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            self.base_url.host_str().unwrap(),
+            self.base_url.port().unwrap()
+        ))
+        .unwrap();
+
+        let mut builder = self.client.request(method, url);
+        if let Some(body) = body {
+            builder = builder.json(&body)
+        }
+        if let Some(jwt_token) = &self.jwt_token {
+            builder = builder.header(
+                reqwest::header::AUTHORIZATION,
+                format!("Bearer {jwt_token}"),
+            );
+        }
+
+        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
+        let response = response.error_from_body().await?;
+
+        response
+            .json()
+            .await
+            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
+    }
+}
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
@@ -283,7 +337,7 @@ async fn main() -> anyhow::Result<()> {
        }
        Command::TenantCreate { tenant_id } => {
            storcon_client
-                .dispatch::<_, ()>(
+                .dispatch(
                    Method::POST,
                    "v1/tenant".to_string(),
                    Some(TenantCreateRequest {
@@ -303,16 +357,13 @@ async fn main() -> anyhow::Result<()> {
            tracing::info!("Delete status: {}", status);
        }
        Command::Nodes {} => {
-            let mut resp = storcon_client
+            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
-
-            resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
-
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
@@ -344,16 +395,13 @@ async fn main() -> anyhow::Result<()> {
                .await?;
        }
        Command::Tenants {} => {
-            let mut resp = storcon_client
+            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
-
-            resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
-
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
@@ -602,11 +650,6 @@ async fn main() -> anyhow::Result<()> {
                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
                .await?;
        }
-        Command::NodeDelete { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
-                .await?;
-        }
        Command::TenantSetTimeBasedEviction {
            tenant_id,
            period,
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -33,7 +33,7 @@ echo $result | jq .

 generate_id timeline_id
 PARAMS=(
-     -sbf
+     -sb 
     -X POST
     -H "Content-Type: application/json"
     -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -31,14 +31,25 @@ services:
    restart: always
    image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
    environment:
+      - BROKER_ENDPOINT='http://storage_broker:50051'
      - AWS_ACCESS_KEY_ID=minio
      - AWS_SECRET_ACCESS_KEY=password
      #- RUST_BACKTRACE=1
    ports:
       #- 6400:6400  # pg protocol handler
       - 9898:9898 # http endpoints
-    volumes:
-      - ./pageserver_config:/data/.neon/
+    entrypoint:
+      - "/bin/sh"
+      - "-c"
+    command:
+      - "/usr/local/bin/pageserver -D /data/.neon/
+                                   -c \"broker_endpoint=$$BROKER_ENDPOINT\"
+                                   -c \"listen_pg_addr='0.0.0.0:6400'\"
+                                   -c \"listen_http_addr='0.0.0.0:9898'\"
+                                   -c \"remote_storage={endpoint='http://minio:9000',
+                                                        bucket_name='neon',
+                                                        bucket_region='eu-north-1',
+                                                        prefix_in_bucket='/pageserver/'}\""
    depends_on:
      - storage_broker
      - minio_create_buckets
--- a/docker-compose/pageserver_config/identity.toml
+++ b/docker-compose/pageserver_config/identity.toml
@@ -1 +0,0 @@
-id=1234
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -1,5 +0,0 @@
-broker_endpoint='http://storage_broker:50051'
-pg_distrib_dir='/usr/local/'
-listen_pg_addr='0.0.0.0:6400'
-listen_http_addr='0.0.0.0:9898'
-remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
--- a/docs/rfcs/034-ancestor-deletion.md
+++ b/docs/rfcs/034-ancestor-deletion.md
@@ -1,252 +0,0 @@
-# Ancestor Timeline Deletion
-
-Created on: 2024-02-23
-
-Author: John Spray
-
-# Summary
-
-When a tenant creates a new timeline that they will treat as their 'main' history,
-it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
-this is necessary because it is forbidden to delete a timeline which has descendents.
-
-A new pageserver API is proposed to 'adopt' data from a parent timeline into
-one of its children, such that the link between ancestor and child can be severed,
-leaving the parent in a state where it may then be deleted.
-
-# Motivation
-
-Retaining parent timelines currently has two costs:
-
- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
-  child's timeline point, even if the child fully covers its keyspace with image
-  layers and will never actually read from the parent.
-
-# Solution
-
-A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
-will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
-wish to detach from its parent.
-
-On success, this API will leave the following state:
-
- The detached child timeline will no longer have an ancestor, and will contain all
-  the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
-  the detached child timeline will be modified to have the child timeline as their
-  new parent.
- The parent timeline will still exist, but the child will no longer have it as an
-  ancestor. If this was the last timeline that depended on the parent, then the
-  parent will become deletable.
-
-This API's implementation will consist of a series of retryable steps, such that
-on failures/timeout it can safely be called again to reach the target state.
-
-## Example
-
-### Before
-
-The user has "rolled back" their project to LSN X, resulting in a "new main"
-timeline. The parent "old main" timeline still exists, and they would like
-to clean it up.
-
-They have two other timelines A and B. A is from before the rollback point,
-and B is from after the rollback point.
-
-```
----"old main" timeline-------X-------------------------------------------->
-                |             |                         |
-                |-> child A   |                         |
-                              |-> "new main" timeline   |
-                                                        -> child B
-
-```
-
-### After calling detach ancestor API
-
-The "new main" timeline is no longer dependent on old main, and neither
-is child A, because it had a branch point before X.
-
-The user may now choose to delete child B and "old main" to get to
-a pristine state. Child B is likely to be unwanted since the user
-chose to roll back to X, and it branches from after X. However, we
-don't assume this in the API; it is up to the user to delete it.
-
-```
-|----"old main" timeline---------------------------------------------------->
-                                                         |
-                                                         |
-                                                         |
-                                                         -> child B
-
-|----"new main" timeline--------->
-                 |
-                 |-> child A
-
-
-```
-
-### After removing timelines
-
-We end up with a totally clean state that leaves no trace that a rollback
-ever happened: there is only one root timeline.
-
-```
-| ----"new main" timeline----------->
-                |
-                |-> child A
-
-
-```
-
-## Caveats
-
-Important things for API users to bear in mind:
-
- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
-  child, the parent won't be deletable: you must either delete or detach those
-  children.
- do _not_ simply loop over all children and detach them all: this can have an
-  extremely high storage cost. The detach ancestor API is intended for use on a single
-  timeline to make it the new "main".
- The detach ancestor API should also not be
-  exposed directly to the user as button/API, because they might decide
-  to click it for all the children and thereby generate many copies of the
-  parent's data -- the detach ancestor API should be used as part
-  of a high level "clean up after rollback" feature.
-
-## `detach_ancestor` API implementation
-
-Terms used in the following sections:
-
- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
-  called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point" the ancestor_lsn of "the child"
-
-### Phase 1: write out adopted layers to S3
-
-The child will "adopt" layers from the parent, such that its end state contains
-all the parent's history as well as its own.
-
-For all layers in the parent's layer map whose high LSN is below the branch
-point, issue S3 CopyObject requests to duplicate them into the child timeline's
-prefix. Do not add them to the child's layer map yet.
-
-For delta layers in the parent's layer map which straddle the branch point, read them
-and write out only content up to the branch point into new layer objects.
-
-This is a long running operation if the parent has many layers: it should be
-implemented in a way that resumes rather than restarting from scratch, if the API
-times out and is called again.
-
-As an optimization, if there are no other timelines that will be adopted into
-the child, _and_ the child's image layers already fully cover the branch LSN,
-then we may skip adopting layers.
-
-### Phase 2: update the child's index
-
-Having written out all needed layers in phase 1, atomically link them all
-into the child's IndexPart and upload to S3. This may be done while the
-child Timeline is still running.
-
-### Phase 3: modify timelines ancestry
-
-Modify the child's ancestor to None, and upload its IndexPart to persist the change.
-
-For all timelines which have the same parent as the child, and have a branch
-point lower than our branch point, switch their ancestor_timeline to the child,
-and upload their IndexPart to persist the change.
-
-## Alternatives considered
-
-### Generate full image layer on child, rather than adopting parent deltas
-
-This would work for the case of a single child, but would prevent re-targeting
-other timelines that depended on the parent. If we detached many children this
-way, the storage cost would become prohibitive (consider a 1TB database with
-100 child timelines: it would cost 100TB if they all generated their own image layers).
-
-### Don't rewrite anything: just fake it in the API
-
-We could add a layer of indirection that let a child "pretend" that it had no
-ancestor, when in reality it still had the parent. The pageserver API could
-accept deletion of ancestor timelines, and just update child metadata to make
-them look like they have no ancestor.
-
-This would not achieve the desired reduction in storage cost, and may well be more
-complex to maintain than simply implementing the API described in this RFC.
-
-### Avoid copying objects: enable child index to use parent layers directly
-
-We could teach IndexPart to store a TimelineId for each layer, such that a child
-timeline could reference a parent's layers directly, rather than copying them
-into the child's prefix.
-
-This would impose a cost for the normal case of indices that only target the
-timeline's own layers, add complexity, and break the useful simplifying
-invariant that timelines "own" their own path. If child timelines were
-referencing layers from the parent, we would have to ensure that the parent
-never runs GC/compaction again, which would make the API less flexible (the
-proposal in this RFC enables deletion of the parent but doesn't require it.)
-
-## Performance
-
-### Adopting layers
-
- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
-  of such requests: this can take up to tens of seconds and will compete for RemoteStorage
-  semaphore units with other activity on the pageserver.
- If we are running on a storage backend that doesn't implement CopyObject, then
-  this part will be much more expensive as we would stream all layer content
-  through the pageserver. This is no different to issuing a lot
-  of reads to a timeline that does not have a warm local cache: it will move
-  a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for delta layers that straddle the branch point will
-  require streaming read/write of all the layers in question.
-
-### Updating timeline ancestry
-
-The simplest way to update timeline ancestry will probably be to stop and start
-all the Timeline objects: this is preferable to the complexity of making their
-ancestry mutable at runtime.
-
-There will be a corresponding "stutter" in the availability of the timelines,
-of the order 10-100ms, which is the time taken to upload their IndexPart, and
-restart the Timeline.
-
-# Interaction with other features
-
-## Concurrent timeline creation
-
-If new historic timelines are created using the parent as an ancestor while the
-detach ancestor API is running, they will not be re-parented to the child. This
-doesn't break anything, but it leaves the parent in a state where it might not
-be possible to delete it.
-
-Since timeline creations are an explicit user action, this is not something we need to
-worry about as the storage layer: a user who wants to delete their parent timeline will not create
-new children, and if they do, they can choose to delete those children to
-enable deleting the parent.
-
-For the least surprise to the user, before starting the detach ancestor branch
-operation, the control plane should wait until all branches are created and not
-allow any branches to be created before the branch point on the ancestor branch
-while the operation is ongoing.
-
-## WAL based disaster recovery
-
-WAL based disaster recovery currently supports only restoring of the main
-branch. Enabling WAL based disaster recovery in the future requires that we
-keep a record of which timeline generated the WAL and at which LSN a parent was
-detached. Keep a list of timeline ids and the LSN at which they were detached in
-the `index_part.json`. Limit the size of the list to 100 first entries, after
-which the WAL disaster recovery will not be possible.
-
-## Sharded tenants
-
-For sharded tenants, calls to the detach ancestor API will pass through the storage
-controller, which will handle them the same as timeline creations: invoke first
-on shard zero, and then on all the other shards.
--- a/docs/rfcs/035-timeline-archive.md
+++ b/docs/rfcs/035-timeline-archive.md
@@ -1,507 +0,0 @@
-# Timeline Archival
-
-## Summary
-
-This RFC describes a mechanism for pageservers to eliminate local storage + compute work
-for timelines which are not in use, in response to external API calls to "archive" a timeline.
-
-The archived state roughly corresponds to fully offloading a timeline to object storage, such
-that its cost is purely the cost of that object storage.
-
-## Motivation
-
-Archived timelines serve multiple purposes:
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
-  database from longer ago than their PITR window.
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
-  to diligently clean them up later to avoid overloading the pageserver (currently we support
-  up to ~500 branches per tenant).
-
-### Prior art
-
-Most storage and database systems have some form of snapshot, which can be implemented several ways:
-1. full copies of data (e.g. an EBS snapshot to S3)
-2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
-3. a series of snapshots which are CoW or de-duplicated relative to one another.
-
-Today's Neon branches are approximately like `2.`, although due to implementation details branches
-often end up storing much more data than they really need, as parent branches assume that all data
-at the branch point is needed.  The layers pinned in the parent branch may have a much larger size
-than the physical size of a compressed image layer representing the data at the branch point.
-
-## Requirements
-
- Enter & exit the archived state in response to external admin API calls
- API calls to modify the archived state are atomic and durable
- An archived timeline should eventually (once out of PITR window) use an efficient compressed
-  representation, and avoid retaining arbitrarily large data in its parent branch.
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
-  but must not scale with the number of _archived_ branches.
- Background I/O for archived branches should only be done a limited number of times to evolve them
-  to a long-term-efficient state (e.g. rewriting to image layers).  There should be no ongoing "housekeeping"
-  overhead for archived branches, including operations related to calculating sizes for billing.
- The pageserver should put no load on the safekeeper for archived branches.
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
-  to a performant state in a short time (linear with the branch's logical size)
-
-## Non Goals
-
- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
-  in Neon's internal format.
- Compute cold starts after activating an archived branch will not have comparable performance to
-  cold starts on an active branch.
- Archived branches will not use any new/additional compression or de-duplication beyond what
-  is already implemented for image layers (zstd per page).
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
-  are only activated explicitly via the HTTP API.
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
-  remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
-  detailed HTTP APIs other than the specific API for listing archived timelines.
- A parent branch may not be archived unless all its children are.
-
-## Impacted Components
-
-pageserver, storage controller
-
-## Terminology
-
-**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
-may assume that this branch is now very cheap to store, although this may not be physically so until the
-branch proceeds to the offloaded state.
-
-**Active** branches are branches which are available for use by page_service clients, and have a relatively
-high cost due to consuming local storage.
-
-**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
-that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
-
-**Activate** (verb): transition from Archived to Active
-
-**Archive** (verb): transition from Active to Archived
-
-**Offload** (verb): transition from Archived to Offloaded
-
-**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
-
-**Warm up** (verb): operation done on an active branch, by downloading its active layers.  Once a branch is
-warmed up, good performance will be available to page_service clients.
-
-## Implementation
-
-### High level flow
-
-We may think of a timeline which is archived and then activated as proceeding through a series of states:
-
-```mermaid
-stateDiagram
-  [*] --> Active(warm)
-  Active(warm) --> Archived
-  Archived --> Offloaded
-  Archived --> Active(warm)
-  Offloaded --> Active(cold)
-  Active(cold) --> Active(warm)
-```
-
-Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
-of branches will be:
- Very frequent: Short lived branches: Active -> Deleted
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
- Rare: Branches used to restore old state: Active -> Archived -> Offloaded -> Active
-
-These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
-of:
- the timeline's lifecycle state: active or archived, stored in the timeline's index
- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
-  manifest of offloaded timelines.
- cache state (whether it's warm or cold).
-
-### Storage format changes
-
-There are two storage format changes:
-1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
-   be considered active or archived.
-2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
-   at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
-
-The manifest object will have a format like this:
-```
-{
-  "offload_timelines": [
-    {
-      "timeline_id": ...
-      "last_record_lsn": ...
-      "last_record_lsn_time": ...
-      "pitr_interval": ...
-      "last_gc_lsn": ...  # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
-      "logical_size": ...  # The size at last_record_lsn
-      "physical_size": ...
-      "parent": Option<{
-        "timeline_id": ...
-        "lsn": ... # Branch point LSN on the parent
-        "requires_data": bool # True if this branch depends on layers in its parent, identify it here
-
-      }>
-    }
-  ]
-}
-```
-
-The information about a timeline in its offload state is intentionally minimal: just enough to decide:
- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
-  by checking if now > last_record_lsn_time + pitr_interval, and pitr_lsn < last_record_lsn.
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
-  layers that the archived branch depends on
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
-  is received for a timeline_id that isn't in the set of live `Timelines` or in the manifest, then
-  we don't need to go to S3 for the deletion).
- How much archived space to report in consumption metrics
-
-The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
-set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
-(offloaded timelines).
-
-For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
-index_part objects are (see [generation numbers RFC](025-generation-numbers.md)).  This will add some complexity, but
-give us total safety against two pageservers with the same tenant attached fighting over the object.  Existing code
-for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
-the manifest file.
-
-### API & Timeline state
-
-Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart.  This will
-be controlled by a new per-timeline `configure` endpoint.  This is intentionally generic naming, which
-may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
-a per-timeline configuration).
-
-`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
-```
-{
-  'state': 'active|archive'
-}
-```
-
-When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
-
-When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
-**and** the `Timeline` object has been instantiated and activated.  This will require reading the timeline's
-index, but not any data: it should be about as fast as a couple of small S3 requests.
-
-The API will be available with identical path via the storage controller: calling this on a sharded tenant
-will simply map the API call to all the shards.
-
-Archived timelines may never have descendent timelines which are active.  This will be enforced at the API level,
-such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
-that all its descendents are archived.  It is the caller's responsibility to walk the hierarchy of timelines
-in the proper order if they would like to archive whole trees of branches.
-
-Because archived timelines will be excluded from the usual timeline listing APIs, a new API specifically
-for archived timelines will be added: this is for use in support/debug:
-
-```
-GET /v1/tenants/{tenant_id}/archived_timelines
-
-{
-  ...same per-timeline content as the tenant manifest...
-}
-
-```
-
-### Tenant attach changes
-
-Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
-we load their index_part.json.  To avoid the number of GETs scaling linearly with the number of archived
-timelines, we must have a single object that tells us which timelines do not need to be loaded.  The
-number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
-because each request covers 1000 timelines.
-
-This is **not** literally the same as the set of timelines who have state=archived.  Rather, it is
-the set of timelines which have been offloaded in the background after their state was set to archived.
-
-We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
-exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need
-to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
-to delete an offloaded timeline.
-
-### Warm-up API
-
-`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
-
-This API will be similar to the existing `download_remote_layers` API, but smarter:
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
-  of downloads, so that the caller can poll.
-
-The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
-of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
-can possibly be read from these LSNs.  This concept of layer visibility is more generally useful for cache
-eviction and heatmaps, as well as in this specific case of warming up a timeline.
-
-The caller does not have to wait for the warm up API, or call it at all.  But it is strongly advised
-to call it, because otherwise populating local contents for a timeline can take a long time when waiting
-for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
-volatile.
-
-### Background work
-
-Archived branches are not subject to normal compaction.  Instead, when the compaction loop encounters
-an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
-([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
-if its state permits that.
-
-Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
-optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
-has elapsed and it can now be rewritten to image layers.
-
-#### Archive branch offload
-
-Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
-any actual work.
-
-This work is done in the background compaction loop.  It makes sense to tag this work on to the compaction
-loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
-
-The condition for offload is simple:
- - a `Timeline` object exists with state `Archived`
- - the timeline does not have any non-offloaded children.
- 
- Regarding the condition that children must be offloaded, this will always be eventually true, because
- we enforce at the API level that children of archived timelines must themselves be archived, and all
- archived timelines will eventually be offloaded.
-
-Offloading a timeline is simple:
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
-
-#### Archive branch optimization (flattening)
-
-When we offloaded a branch, it might have had some history that prevented rewriting it to a single
-point in time set of image layers.  For example, a branch might have several days of writes and a 7
-day PITR: when we archive it, it still has those days of history.
-
-Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
-  a point in time compared with delta layers
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
-  for data, i.e. the ancestor is free to GC layers files at+below the branch point
-
-Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
-branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
-a true snapshot at that LSN.
-
-It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
-is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
-
-Archive branch optimization should be done _before_ background offloads during compaction, because there may
-be timelines which are ready to be offloaded but also would benefit from the optimization step before
-being offloaded.  For example, a branch which has already fallen out of PITR window and has no history
-of its own may be immediately re-written as a series of image layers before being offloaded.
-
-### Consumption metrics
-
-Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipation
-that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
-vs. ordinary content.
-
-Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
-variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
-
-### Secondary locations
-
-Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby
-when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
-will be dropped from secondary locations.
-
-### Sharding
-
-Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
-the same way that timeline creation and deletion is done.  There are no special rules about ordering:
-the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
-
-Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
-will be authoritative for consumption metrics.
-
-## Error cases
-
-### Errors in sharded tenants
-
-If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
-state, where a timeline is archived on some shards but not on others.  
-
-We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
-are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
-In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
-up their mistake.  We rely on this good behavior of callers to eventually get timelines into a consistent
-state across all shards.  If callers do leave a timeline in an inconsistent state across shards, this doesn't
-break anything, it's just "weird".
-
-This is similar to the status quo for timeline creation and deletion: callers are expected to retry
-these operations until they succeed.
-
-### Archiving/activating
-
-Archiving/activating a timeline can fail in a limited number of ways:
-1. I/O error storing/reading the timeline's updated index
-    - These errors are always retryable: a fundamental design assumption of the pageserver is that remote
-      storage errors are always transient. 
-2. NotFound if the timeline doesn't exist
-    - Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
-    - The storage controller has runtime locking to prevent races such as deleting a timeline while
-      archiving it.
-3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated
-    - Callers are expected to do their own checks to avoid hitting this case.  If they make
-      a mistake and encounter this error, they should give up.
-
-### Offloading
-
-Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
-tenant manifest.  In such error cases, we give up in the expectation that offloading will be tried 
-again at the next iteration of the compaction loop.
-
-### Archive branch optimization
-
-Optimization is a special form of compaction, so can encounter all the same errors as regular compaction
-can: it should return Result<(), CompactionError>, and as with compaction it will be retried on
-the next iteration of the compaction loop.
-
-## Optimizations
-
-### Delaying storage optimization if retaining parent layers is cheaper
-
-Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
-is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
-are offloaded to S3 they're totally safe, inert things.
-
-However, in some cases it can be advantageous to retain extra history on their parent branch rather
-than flattening the archived branch.  For example, if a 1TB parent branch is rather slow-changing (1GB
-of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
-for each nightly branch is inefficient compared with just keeping more history on the main branch.
-
-Getting this right requires consideration of:
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
-  write out extra image layers, then it might make more sense to just write out the image layers on
-  the archived branch.
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
-  the layer map (and index_part) bigger.  There are practical limits beyond which writing an indefinitely
-  large layer map can cause problems elsewhere.
-
-This optimization can probably be implemented quite cheaply with some basic heuristics like:
- don't bother doing optimization on an archive branch if the LSN distance between
-  its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
- ...but, Don't keep more history on the main branch than double the PITR
-
-### Creating a timeline in archived state (a snapshot)
-
-Sometimes, one might want to create a branch with no history, which will not be written to
-before it is archived.  This is a snapshot, although we do not require a special snapshot API,
-since a snapshot can be represented as a timeline with no history.
-
-This can be accomplished by simply creating a timeline and then immediately archiving it, but
-that is somewhat wasteful: this timeline will spin up various tasks and open a connection to the storage
-broker to try and ingest WAL, before being shut down in the subsequent archival call.  To explicitly
-support this common special case, we may add a parameter to the timeline creation API which
-creates a timeline directly into the archived state.
-
-Such a timeline creation will do exactly two I/Os at creation time:
- write the index_part object to record the timeline's existence
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
-  write the tenant manifest.
-
-Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
-up the 'snapshot' branch and write out image layers.
-
-## Future Work
-
-### Enabling `fullbackup` dumps from archive branches
-
-It would be useful to be able to export an archive branch to another system, or for use in a local
-postgres database.
-
-This could be implemented as a general capability for all branches, in which case it would "just work"
-for archive branches by activating them.  However, downloading all the layers in a branch just to generate
-a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
-which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
-
-Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
-is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup 
-stream to S3 in an intermediate format, and then having one node stitch them together).
-
-### Tagging layers from archived branches
-
-When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
-we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
-cheaper storage.
-
-This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
-external hints on which branches are likely to be reactivated, and which branches are good candidates for
-tagging for low performance storage.
-
-Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes.  Other clouds' object
-stores have similar mechanisms.
-
-### Storing sequences of archive branches as deltas
-
-When archived branches are used as scheduled snapshots, we could store them even more efficiently
-by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
-storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
-pages). This is the kind of encoding that many backup storage systems use.
-
-The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
-vs. just writing out a simple stream of the entire database.  For smaller databases, writing out a full
-copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
-so the complexity tradeoff of diff-encoding it is dubious).
-
-One does not necessarily have to read back the previous snapshot in order to encode the next one: if the
-pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
-we can say: "A branch exists from Monday night.  I have Monday night's data still active in the main branch,
-so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
-delta snapshot".
-
-Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
-each other: perhaps this would be done by making the archive branches have child/parent relationships with
-each other, or perhaps we would permit them to remain children of their original parent, but additionally
-have a relationship with the snapshot they're encoded relative to.
-
-Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
-out how frequently to write a full copy is important.  This is essentially a zoomed-out version of what
-we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
-
-
-## FAQ/Alternatives
-
-### Store all timelines in the tenant manifest
-
-Rather than special-casing offloaded timelines in the offload manifest, we could store a total
-manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
-startup.
-
-That would be a more invasive change (require hooking in to timeline creation), and would
-generate much more I/O to this manifest for tenants that had many branches _and_ frequent
-create/delete cycles for short lived branches.  Restricting the manifest to offloaded timelines
-means that we only have to cope with the rate at which long-lived timelines are archived, rather
-than the rate at which short-lived timelines are created & destroyed.
-
-### Automatically archiving/activating timelines without external API calls
-
-We could implement TTL driven offload of timelines, waking them up when a page request
-arrives.
-
-This has downsides:
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
-  know which of their branches are in this state, and might get a surprise when they try
-  to use such a branch.
- Price fluctuation: if the archival of a branch is used in end user pricing, then users
-  prefer clarity & consistency.  Ideally a branch's storage should cost the same from the moment it
-  is created, rather than having a usage-dependent storage price.
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
-  would be awkward, compared with an external entry point.
-
-### Make offloaded a state of Timeline
-
-To reduce the operator-facing complexity of having some timeline APIs that only return
-non-offloaded timelines, we could build the offloaded state into the Timeline type.
-
-`timeline.rs` is already one of the most egregiously long source files in the tree, so
-this is rejected on the basis that we need to avoid making that complexity worse.
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration:
 - Use `diesel migration generate <name>` to create a new migration
 - Populate the SQL files in the `migrations/` subdirectory
 - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
-  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
+  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
 - Commit the migration files and the changes to schema.rs
 - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
 - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -13,7 +13,11 @@ use std::{

 use measured::{
    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
-    metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
+    metric::{
+        group::{Encoding, MetricValue},
+        name::MetricNameEncoder,
+        Metric, MetricType, MetricVec,
+    },
    text::TextEncoder,
    LabelGroup,
 };
@@ -140,7 +144,6 @@ impl<const N: usize> HyperLogLogState<N> {
        })
    }
 }
-
 impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
    for HyperLogLogState<N>
 {
@@ -179,13 +182,12 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
            .into_iter()
            .enumerate()
            .try_for_each(|(hll_shard, val)| {
-                CounterState::new(val as u64).collect_into(
-                    &(),
+                enc.write_metric_value(
+                    name.by_ref(),
                    labels.by_ref().compose_with(HllShardLabel {
                        hll_shard: hll_shard as i64,
                    }),
-                    name.by_ref(),
-                    enc,
+                    MetricValue::Int(val as i64),
                )
            })
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -9,7 +9,7 @@ use measured::{
    metric::{
        counter::CounterState,
        gauge::GaugeState,
-        group::Encoding,
+        group::{Encoding, MetricValue},
        name::{MetricName, MetricNameEncoder},
        MetricEncoding, MetricFamilyEncoding,
    },
@@ -171,11 +171,8 @@ fn write_gauge<Enc: Encoding>(
    labels: impl LabelGroup,
    name: impl MetricNameEncoder,
    enc: &mut Enc,
-) -> Result<(), Enc::Err>
-where
-    GaugeState: MetricEncoding<Enc>,
-{
-    GaugeState::new(x).collect_into(&(), labels, name, enc)
+) -> Result<(), Enc::Err> {
+    enc.write_metric_value(name, labels, MetricValue::Int(x))
 }

 #[derive(Default)]
@@ -547,6 +544,15 @@ impl<T: Encoding> Encoding for Inc<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
 }

 impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
@@ -573,6 +579,15 @@ impl<T: Encoding> Encoding for Dec<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
 }

 /// Write the dec counter to the encoder
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,5 +1,4 @@
 use std::str::FromStr;
-use std::time::Instant;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
@@ -88,7 +87,7 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }

-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
    pub tenant_id: TenantId,
    pub shards: Vec<TenantDescribeResponseShard>,
@@ -111,7 +110,7 @@ pub struct NodeDescribeResponse {
    pub listen_pg_port: u16,
 }

-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,

@@ -151,16 +150,11 @@ impl UtilizationScore {
    }
 }

-#[derive(Serialize, Clone, Copy, Debug)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
    Active(UtilizationScore),
-    // Node is warming up, but we expect it to become available soon. Covers
-    // the time span between the re-attach response being composed on the storage controller
-    // and the first successful heartbeat after the processing of the re-attach response
-    // finishes on the pageserver.
-    WarmingUp(Instant),
    // Offline: Tenants shouldn't try to attach here, but they may assume that their
    // secondary locations on this node still exist.  Newly added nodes are in this
    // state until we successfully contact them.
@@ -170,10 +164,7 @@ pub enum NodeAvailability {
 impl PartialEq for NodeAvailability {
    fn eq(&self, other: &Self) -> bool {
        use NodeAvailability::*;
-        matches!(
-            (self, other),
-            (Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_))
-        )
+        matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
    }
 }

@@ -185,7 +176,6 @@ impl Eq for NodeAvailability {}
 #[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
    Active,
-    WarmingUp,
    Offline,
 }

@@ -195,7 +185,6 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
            // Assume the worst utilisation score to begin with. It will later be updated by
            // the heartbeats.
            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
-            NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
        }
    }
@@ -205,7 +194,6 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    fn from(val: NodeAvailability) -> Self {
        match val {
            NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
-            NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp,
            NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
        }
    }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -5,10 +5,10 @@ pub mod utilization;
 pub use utilization::PageserverUtilization;

 use std::{
+    borrow::Cow,
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
-    str::FromStr,
    sync::atomic::AtomicUsize,
    time::{Duration, SystemTime},
 };
@@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::serde_as;
 use utils::{
    completion,
+    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
    serde_system_time,
@@ -292,6 +293,7 @@ pub struct TenantConfig {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
@@ -435,8 +437,22 @@ pub enum CompactionAlgorithm {
    Tiered,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    PartialEq,
+    Eq,
+    Serialize,
+    Deserialize,
+    strum_macros::FromRepr,
+    strum_macros::EnumString,
+)]
+#[strum(serialize_all = "kebab-case")]
 pub enum ImageCompressionAlgorithm {
+    /// Disabled for writes, and never decompress during reading.
+    /// Never set this after you've enabled compression once!
+    DisabledNoDecompress,
    // Disabled for writes, support decompressing during read path
    Disabled,
    /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
@@ -446,27 +462,9 @@ pub enum ImageCompressionAlgorithm {
    },
 }

-impl FromStr for ImageCompressionAlgorithm {
-    type Err = anyhow::Error;
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut components = s.split(['(', ')']);
-        let first = components
-            .next()
-            .ok_or_else(|| anyhow::anyhow!("empty string"))?;
-        match first {
-            "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
-            "zstd" => {
-                let level = if let Some(v) = components.next() {
-                    let v: i8 = v.parse()?;
-                    Some(v)
-                } else {
-                    None
-                };
-
-                Ok(ImageCompressionAlgorithm::Zstd { level })
-            }
-            _ => anyhow::bail!("invalid specifier '{first}'"),
-        }
+impl ImageCompressionAlgorithm {
+    pub fn allow_decompression(&self) -> bool {
+        !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress)
    }
 }

@@ -649,17 +647,6 @@ pub struct TenantDetails {
    pub timelines: Vec<TimelineId>,
 }

-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
-pub enum TimelineArchivalState {
-    Archived,
-    Unarchived,
-}
-
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
-pub struct TimelineArchivalConfigRequest {
-    pub state: TimelineArchivalState,
-}
-
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
@@ -724,7 +711,58 @@ pub struct LayerMapInfo {
    pub historic_layers: Vec<HistoricLayerInfo>,
 }

-/// The residence status of a layer
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
+#[repr(usize)]
+pub enum LayerAccessKind {
+    GetValueReconstructData,
+    Iter,
+    KeyIter,
+    Dump,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LayerAccessStatFullDetails {
+    pub when_millis_since_epoch: u64,
+    pub task_kind: Cow<'static, str>,
+    pub access_kind: LayerAccessKind,
+}
+
+/// An event that impacts the layer's residence status.
+#[serde_as]
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LayerResidenceEvent {
+    /// The time when the event occurred.
+    /// NB: this timestamp is captured while the residence status changes.
+    /// So, it might be behind/ahead of the actual residence change by a short amount of time.
+    ///
+    #[serde(rename = "timestamp_millis_since_epoch")]
+    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
+    pub timestamp: SystemTime,
+    /// The new residence status of the layer.
+    pub status: LayerResidenceStatus,
+    /// The reason why we had to record this event.
+    pub reason: LayerResidenceEventReason,
+}
+
+/// The reason for recording a given [`LayerResidenceEvent`].
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub enum LayerResidenceEventReason {
+    /// The layer map is being populated, e.g. during timeline load or attach.
+    /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
+    /// We need to record such events because there is no persistent storage for the events.
+    ///
+    // https://github.com/rust-lang/rust/issues/74481
+    /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
+    /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
+    LayerLoad,
+    /// We just created the layer (e.g., freeze_and_flush or compaction).
+    /// Such layers are always [`LayerResidenceStatus::Resident`].
+    LayerCreate,
+    /// We on-demand downloaded or evicted the given layer.
+    ResidenceChange,
+}
+
+/// The residence status of the layer, after the given [`LayerResidenceEvent`].
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LayerResidenceStatus {
    /// Residence status for a layer file that exists locally.
@@ -734,16 +772,23 @@ pub enum LayerResidenceStatus {
    Evicted,
 }

-#[serde_as]
+impl LayerResidenceEvent {
+    pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
+        Self {
+            status,
+            reason,
+            timestamp: SystemTime::now(),
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStats {
-    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
-    pub access_time: SystemTime,
-
-    #[serde_as(as = "serde_with::TimestampMilliSeconds")]
-    pub residence_time: SystemTime,
-
-    pub visible: bool,
+    pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
+    pub task_kind_access_flag: Vec<Cow<'static, str>>,
+    pub first: Option<LayerAccessStatFullDetails>,
+    pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
+    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -1615,25 +1660,4 @@ mod tests {
            AuxFilePolicy::CrossValidation
        );
    }
-
-    #[test]
-    fn test_image_compression_algorithm_parsing() {
-        use ImageCompressionAlgorithm::*;
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
-            Disabled
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
-            Zstd { level: None }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
-            Zstd { level: Some(18) }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
-            Zstd { level: Some(-3) }
-        );
-    }
 }
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +1,6 @@
 use utils::id::TimelineId;

-#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
+#[derive(Default, serde::Serialize)]
 pub struct AncestorDetached {
    pub reparented_timelines: Vec<TimelineId>,
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,42 +1,59 @@
-//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
-//!
-//! This module contains a variety of types used to represent the concept of sharding
-//! a Neon tenant across multiple physical shards.  Since there are quite a few of these,
-//! we provide an summary here.
-//!
-//! Types used to describe shards:
-//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
-//!   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
-//!   a shard suffix.
-//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
-//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
-//!   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
-//!   tenant, such as layer files.
-//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
-//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
-//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
-//!   four hex digits.  An unsharded tenant is `0000`.
-//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
-//!
-//! Types used to describe the parameters for data distribution in a sharded tenant:
-//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
-//!   multiple shards.  Its value is given in 8kiB pages.
-//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
-//!   always zero: this is provided for future upgrades that might introduce different
-//!   data distribution schemes.
-//!
-//! Examples:
-//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
-//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
-//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
-//!   and their slugs are 0004, 0104, 0204, and 0304.
+use std::{ops::RangeInclusive, str::FromStr};

 use crate::{key::Key, models::ShardParameters};
+use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
+use utils::id::TenantId;

-#[doc(inline)]
-pub use ::utils::shard::*;
+/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
+///
+/// This module contains a variety of types used to represent the concept of sharding
+/// a Neon tenant across multiple physical shards.  Since there are quite a few of these,
+/// we provide a summary here.
+///
+/// Types used to describe shards:
+/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+///   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
+///   a shard suffix.
+/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
+///   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
+///   tenant, such as layer files.
+/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
+///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+///   four hex digits.  An unsharded tenant is `0000`.
+/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
+///
+/// Types used to describe the parameters for data distribution in a sharded tenant:
+/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+///   multiple shards.  Its value is given in 8kiB pages.
+/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+///   always zero: this is provided for future upgrades that might introduce different
+///   data distribution schemes.
+///
+/// Examples:
+/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
+/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
+/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
+///   and their slugs are 0004, 0104, 0204, and 0304.
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardNumber(pub u8);
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardCount(u8);
+
+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}

 /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
 /// and to check whether that [`ShardNumber`] is the same as the current shard.
@@ -48,6 +65,362 @@ pub struct ShardIdentity {
    layout: ShardLayout,
 }

+/// Formatting helper, for generating the `shard_id` label in traces.
+struct ShardSlug<'a>(&'a TenantShardId);
+
+/// TenantShardId globally identifies a particular shard in a particular tenant.
+///
+/// These are written as `<TenantId>-<ShardSlug>`, for example:
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
+/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
+/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
+///
+/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible with TenantId: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl ShardCount {
+    pub const MAX: Self = Self(u8::MAX);
+
+    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
+    /// legacy format for TenantShardId that excludes the shard suffix", also known
+    /// as [`TenantShardId::unsharded`].
+    ///
+    /// This method returns the actual number of shards, i.e. if our internal value is
+    /// zero, we return 1 (unsharded tenants have 1 shard).
+    pub fn count(&self) -> u8 {
+        if self.0 > 0 {
+            self.0
+        } else {
+            1
+        }
+    }
+
+    /// The literal internal value: this is **not** the number of shards in the
+    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
+    /// [`Self::count`] if you want to know the cardinality of shards.
+    pub fn literal(&self) -> u8 {
+        self.0
+    }
+
+    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
+    /// uses the legacy format for `TenantShardId`. See also the documentation for
+    /// [`Self::count`].
+    pub fn is_unsharded(&self) -> bool {
+        self.0 == 0
+    }
+
+    /// `val` may be zero, or the number of shards in the tenant.  `val` is what
+    /// [`Self::literal`] would return.
+    pub const fn new(val: u8) -> Self {
+        Self(val)
+    }
+}
+
+impl ShardNumber {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl TenantShardId {
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        RangeInclusive::new(
+            Self {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            },
+            Self {
+                tenant_id,
+                shard_number: ShardNumber::MAX,
+                shard_count: ShardCount::MAX,
+            },
+        )
+    }
+
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+
+    /// Convenience for code that has special behavior on the 0th shard.
+    pub fn is_shard_zero(&self) -> bool {
+        self.shard_number == ShardNumber(0)
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
+    }
+
+    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
+    /// is useful when logging from code that is already in a span that includes tenant ID, to
+    /// keep messages reasonably terse.
+    pub fn to_index(&self) -> ShardIndex {
+        ShardIndex {
+            shard_number: self.shard_number,
+            shard_count: self.shard_count,
+        }
+    }
+
+    /// Calculate the children of this TenantShardId when splitting the overall tenant into
+    /// the given number of shards.
+    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
+        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
+        let mut child_shards = Vec::new();
+        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
+            // Key mapping is based on a round robin mapping of key hash modulo shard count,
+            // so our child shards are the ones which the same keys would map to.
+            if shard_number % effective_old_shard_count == self.shard_number.0 {
+                child_shards.push(TenantShardId {
+                    tenant_id: self.tenant_id,
+                    shard_number: ShardNumber(shard_number),
+                    shard_count: new_shard_count,
+                })
+            }
+        }
+
+        child_shards
+    }
+}
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count != ShardCount(0) {
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+        } else {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            self.tenant_id.fmt(f)
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(b[16]),
+            shard_count: ShardCount(b[17]),
+        }
+    }
+}
+
+impl ShardIndex {
+    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            shard_number: number,
+            shard_count: count,
+        }
+    }
+    pub fn unsharded() -> Self {
+        Self {
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
+
+    /// For use in constructing remote storage paths: concatenate this with a TenantId
+    /// to get a fully qualified TenantShardId.
+    ///
+    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
+    /// that the legacy pre-sharding remote key format is preserved.
+    pub fn get_suffix(&self) -> String {
+        if self.is_unsharded() {
+            "".to_string()
+        } else {
+            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+        }
+    }
+}
+
+impl std::fmt::Display for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    }
+}
+
+impl std::fmt::Debug for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for ShardIndex {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 1 byte shard number, 1 byte shard count
+        if s.len() == 4 {
+            let bytes = s.as_bytes();
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(bytes, &mut shard_parts)?;
+            Ok(Self {
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 2]> for ShardIndex {
+    fn from(b: [u8; 2]) -> Self {
+        Self {
+            shard_number: ShardNumber(b[0]),
+            shard_count: ShardCount(b[1]),
+        }
+    }
+}
+
+impl Serialize for TenantShardId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Note: while human encoding of [`TenantShardId`] is backward and forward
+            // compatible, this binary encoding is not.
+            let mut packed: [u8; 18] = [0; 18];
+            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
+            packed[16] = self.shard_number.0;
+            packed[17] = self.shard_count.0;
+
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 18])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 18] = Deserialize::deserialize(s)?;
+                Ok(TenantShardId::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                18,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
@@ -212,6 +585,77 @@ impl ShardIdentity {
    }
 }

+impl Serialize for ShardIndex {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Binary encoding is not used in index_part.json, but is included in anticipation of
+            // switching various structures (e.g. inter-process communication, remote metadata) to more
+            // compact binary encodings in future.
+            let mut packed: [u8; 2] = [0; 2];
+            packed[0] = self.shard_number.0;
+            packed[1] = self.shard_count.0;
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for ShardIndex {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = ShardIndex;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 2])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 2] = Deserialize::deserialize(s)?;
+                Ok(ShardIndex::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                ShardIndex::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                2,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
 /// in order to be able to serve basebackup requests without peer communication).
 fn key_is_shard0(key: &Key) -> bool {
@@ -293,9 +737,7 @@ pub fn describe(

 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
-    use utils::{id::TenantId, Hex};
+    use utils::Hex;

    use super::*;

--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -13,7 +13,6 @@ rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-util.workspace = true
 tokio-rustls.workspace = true
 tracing.workspace = true

@@ -24,4 +23,4 @@ workspace_hack.workspace = true
 once_cell.workspace = true
 rustls-pemfile.workspace = true
 tokio-postgres.workspace = true
-tokio-postgres-rustls.workspace = true
+tokio-postgres-rustls.workspace = true
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -16,7 +16,6 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -401,15 +400,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
    }

    /// Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run(
+    pub async fn run<F, S>(
        mut self,
        handler: &mut impl Handler<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError> {
-        let ret = self.run_message_loop(handler, cancel).await;
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S + Clone,
+        S: Future,
+    {
+        let ret = self
+            .run_message_loop(handler, shutdown_watcher.clone())
+            .await;

        tokio::select! {
-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // do nothing; we most likely got already stopped by shutdown and will log it next.
            }
            _ = self.framed.shutdown() => {
@@ -439,17 +444,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        }
    }

-    async fn run_message_loop(
+    async fn run_message_loop<F, S>(
        &mut self,
        handler: &mut impl Handler<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError> {
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
        trace!("postgres backend to {:?} started", self.peer_addr);

        tokio::select!(
            biased;

-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
                return Err(QueryError::Shutdown)
@@ -464,7 +473,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        let mut query_string = Bytes::new();
        while let Some(msg) = tokio::select!(
            biased;
-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
                return Err(QueryError::Shutdown)
@@ -476,7 +485,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            let result = self.process_message(handler, msg, &mut query_string).await;
            tokio::select!(
                biased;
-                _ = cancel.cancelled() => {
+                _ = shutdown_watcher() => {
                    // We were requested to shut down.
                    tracing::info!("shutdown request received during response flush");

@@ -663,17 +672,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        assert!(self.state < ProtoState::Authentication);
        let have_tls = self.tls_config.is_some();
        match msg {
-            FeStartupPacket::SslRequest { direct } => {
+            FeStartupPacket::SslRequest => {
                debug!("SSL requested");

-                if !direct {
-                    self.write_message(&BeMessage::EncryptionResponse(have_tls))
-                        .await?;
-                } else if !have_tls {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "direct SSL negotiation but no TLS support"
-                    )));
-                }
+                self.write_message(&BeMessage::EncryptionResponse(have_tls))
+                    .await?;

                if have_tls {
                    self.start_tls().await?;
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -3,14 +3,13 @@ use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
 use std::io::Cursor;
-use std::sync::Arc;
+use std::{future, sync::Arc};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::{TcpListener, TcpStream};
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
 use tokio_postgres_rustls::MakeRustlsConnect;
-use tokio_util::sync::CancellationToken;

 // generate client, server test streams
 async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -51,7 +50,7 @@ async fn simple_select() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, &CancellationToken::new()).await
+        pgbackend.run(&mut handler, future::pending::<()>).await
    });

    let conf = Config::new();
@@ -103,7 +102,7 @@ async fn simple_select_ssl() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, &CancellationToken::new()).await
+        pgbackend.run(&mut handler, future::pending::<()>).await
    });

    let client_cfg = rustls::ClientConfig::builder()
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -48,15 +48,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
 #[allow(clippy::identity_op)]
 pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;

-/// Interval of checkpointing metadata file. We should store metadata file to enforce
-/// predicate that checkpoint.nextXid is larger than any XID in WAL.
-/// But flushing checkpoint file for each transaction seems to be too expensive,
-/// so XID_CHECKPOINT_INTERVAL is used to forward align nextXid and so perform
-/// metadata checkpoint only once per XID_CHECKPOINT_INTERVAL transactions.
-/// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
-/// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG.
-const XID_CHECKPOINT_INTERVAL: u32 = 1024;
-
 pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
    (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
 }
@@ -331,14 +322,10 @@ impl CheckPoint {
    /// Returns 'true' if the XID was updated.
    pub fn update_next_xid(&mut self, xid: u32) -> bool {
        // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
-        let mut new_xid = std::cmp::max(
+        let new_xid = std::cmp::max(
            xid.wrapping_add(1),
            pg_constants::FIRST_NORMAL_TRANSACTION_ID,
        );
-        // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
-        // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
-        new_xid =
-            new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
        let full_xid = self.nextXid.value;
        let old_xid = full_xid as u32;
        if new_xid.wrapping_sub(old_xid) as i32 > 0 {
@@ -360,7 +347,7 @@ impl CheckPoint {
    /// Advance next multi-XID/offset to those given in arguments.
    ///
    /// It's important that this handles wraparound correctly. This should match the
-    /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function.
+    /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function.
    ///
    /// Returns 'true' if the Checkpoint was updated.
    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -44,9 +44,9 @@ impl ConnectionError {
 /// Wraps async io `stream`, providing messages to write/flush + read Postgres
 /// messages.
 pub struct Framed<S> {
-    pub stream: S,
-    pub read_buf: BytesMut,
-    pub write_buf: BytesMut,
+    stream: S,
+    read_buf: BytesMut,
+    write_buf: BytesMut,
 }

 impl<S> Framed<S> {
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -39,39 +39,14 @@ pub enum FeMessage {
    PasswordMessage(Bytes),
 }

-#[derive(Clone, Copy, PartialEq, PartialOrd)]
-pub struct ProtocolVersion(u32);
-
-impl ProtocolVersion {
-    pub const fn new(major: u16, minor: u16) -> Self {
-        Self((major as u32) << 16 | minor as u32)
-    }
-    pub const fn minor(self) -> u16 {
-        self.0 as u16
-    }
-    pub const fn major(self) -> u16 {
-        (self.0 >> 16) as u16
-    }
-}
-
-impl fmt::Debug for ProtocolVersion {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_list()
-            .entry(&self.major())
-            .entry(&self.minor())
-            .finish()
-    }
-}
-
 #[derive(Debug)]
 pub enum FeStartupPacket {
    CancelRequest(CancelKeyData),
-    SslRequest {
-        direct: bool,
-    },
+    SslRequest,
    GssEncRequest,
    StartupMessage {
-        version: ProtocolVersion,
+        major_version: u32,
+        minor_version: u32,
        params: StartupMessageParams,
    },
 }
@@ -326,23 +301,11 @@ impl FeStartupPacket {
    /// different from [`FeMessage::parse`] because startup messages don't have
    /// message type byte; otherwise, its comments apply.
    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
        const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
-        const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
-        const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
-        const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
-        const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
-
-        // <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
-        // First byte indicates standard SSL handshake message
-        // (It can't be a Postgres startup length because in network byte order
-        // that would be a startup packet hundreds of megabytes long)
-        if buf.first() == Some(&0x16) {
-            return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
-        }
+        const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
+        const CANCEL_REQUEST_CODE: u32 = 5678;
+        const NEGOTIATE_SSL_CODE: u32 = 5679;
+        const NEGOTIATE_GSS_CODE: u32 = 5680;

        // need at least 4 bytes with packet len
        if buf.len() < 4 {
@@ -375,10 +338,12 @@ impl FeStartupPacket {
        let mut msg = buf.split_to(len).freeze();
        msg.advance(4); // consume len

-        let request_code = ProtocolVersion(msg.get_u32());
+        let request_code = msg.get_u32();
+        let req_hi = request_code >> 16;
+        let req_lo = request_code & ((1 << 16) - 1);
        // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
-        let message = match request_code {
-            CANCEL_REQUEST_CODE => {
+        let message = match (req_hi, req_lo) {
+            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
                if msg.remaining() != 8 {
                    return Err(ProtocolError::BadMessage(
                        "CancelRequest message is malformed, backend PID / secret key missing"
@@ -390,22 +355,21 @@ impl FeStartupPacket {
                    cancel_key: msg.get_i32(),
                })
            }
-            NEGOTIATE_SSL_CODE => {
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
                // Requested upgrade to SSL (aka TLS)
-                FeStartupPacket::SslRequest { direct: false }
+                FeStartupPacket::SslRequest
            }
-            NEGOTIATE_GSS_CODE => {
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
                // Requested upgrade to GSSAPI
                FeStartupPacket::GssEncRequest
            }
-            version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
+            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
                return Err(ProtocolError::Protocol(format!(
-                    "Unrecognized request code {}",
-                    version.minor()
+                    "Unrecognized request code {unrecognized_code}"
                )));
            }
            // TODO bail if protocol major_version is not 3?
-            version => {
+            (major_version, minor_version) => {
                // StartupMessage

                let s = str::from_utf8(&msg).map_err(|_e| {
@@ -418,7 +382,8 @@ impl FeStartupPacket {
                })?;

                FeStartupPacket::StartupMessage {
-                    version,
+                    major_version,
+                    minor_version,
                    params: StartupMessageParams {
                        params: msg.slice_ref(s.as_bytes()),
                    },
@@ -557,10 +522,6 @@ pub enum BeMessage<'a> {
    RowDescription(&'a [RowDescriptor<'a>]),
    XLogData(XLogDataBody<'a>),
    NoticeResponse(&'a str),
-    NegotiateProtocolVersion {
-        version: ProtocolVersion,
-        options: &'a [&'a str],
-    },
    KeepAlive(WalSndKeepAlive),
 }

@@ -984,18 +945,6 @@ impl<'a> BeMessage<'a> {
                    buf.put_u8(u8::from(req.request_reply));
                });
            }
-
-            BeMessage::NegotiateProtocolVersion { version, options } => {
-                buf.put_u8(b'v');
-                write_body(buf, |buf| {
-                    buf.put_u32(version.0);
-                    buf.put_u32(options.len() as u32);
-                    for option in options.iter() {
-                        write_cstr(option, buf)?;
-                    }
-                    Ok(())
-                })?
-            }
        }
        Ok(())
    }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -7,7 +7,6 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
-async-stream.workspace = true
 once_cell.workspace = true
 aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -15,7 +15,7 @@ use std::time::SystemTime;
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::{Continuable, RetryOptions};
+use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::blob::CopyStatus;
@@ -40,7 +40,6 @@ use crate::{

 pub struct AzureBlobStorage {
    client: ContainerClient,
-    container_name: String,
    prefix_in_container: Option<String>,
    max_keys_per_list_response: Option<NonZeroU32>,
    concurrency_limiter: ConcurrencyLimiter,
@@ -86,7 +85,6 @@ impl AzureBlobStorage {

        Ok(AzureBlobStorage {
            client,
-            container_name: azure_config.container_name.to_owned(),
            prefix_in_container: azure_config.prefix_in_container.to_owned(),
            max_keys_per_list_response,
            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
@@ -240,10 +238,6 @@ impl AzureBlobStorage {
            _ = cancel.cancelled() => Err(Cancelled),
        }
    }
-
-    pub fn container_name(&self) -> &str {
-        &self.container_name
-    }
 }

 fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
@@ -267,30 +261,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
 }

 impl RemoteStorage for AzureBlobStorage {
-    fn list_streaming(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
-            });
+    ) -> anyhow::Result<Listing, DownloadError> {
+        let _permit = self.permit(RequestKind::List, cancel).await?;

-        async_stream::stream! {
-            let _permit = self.permit(RequestKind::List, cancel).await?;
+        let op = async {
+            // use the passed prefix or, if it is not set, fall back to the prefix_in_container value
+            let list_prefix = prefix
+                .map(|p| self.relative_path_to_name(p))
+                .or_else(|| self.prefix_in_container.clone())
+                .map(|mut p| {
+                    // required to end with a separator
+                    // otherwise request will return only the entry of a prefix
+                    if matches!(mode, ListingMode::WithDelimiter)
+                        && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                    {
+                        p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    }
+                    p
+                });

            let mut builder = self.client.list_blobs();

@@ -306,43 +300,21 @@ impl RemoteStorage for AzureBlobStorage {
                builder = builder.max_results(MaxResults::new(limit));
            }

-            let mut next_marker = None;
+            let response = builder.into_stream();
+            let response = response.into_stream().map_err(to_download_error);
+            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+            let response = response.map(|res| match res {
+                Ok(res) => res,
+                Err(_elapsed) => Err(DownloadError::Timeout),
+            });

-            'outer: loop {
-                let mut builder = builder.clone();
-                if let Some(marker) = next_marker.clone() {
-                    builder = builder.marker(marker);
-                }
-                let response = builder.into_stream();
-                let response = response.into_stream().map_err(to_download_error);
-                let response = tokio_stream::StreamExt::timeout(response, self.timeout);
-                let response = response.map(|res| match res {
-                    Ok(res) => res,
-                    Err(_elapsed) => Err(DownloadError::Timeout),
-                });
+            let mut response = std::pin::pin!(response);

-                let mut response = std::pin::pin!(response);
+            let mut res = Listing::default();

-                let mut max_keys = max_keys.map(|mk| mk.get());
-                let next_item = tokio::select! {
-                    op = response.next() => Ok(op),
-                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
-                }?;
-                let Some(entry) = next_item else {
-                    // The list is complete, so yield it.
-                    break;
-                };
-
-                let mut res = Listing::default();
-                let entry = match entry {
-                    Ok(entry) => entry,
-                    Err(e) => {
-                        // The error is potentially retryable, so we must rewind the loop after yielding.
-                        yield Err(e);
-                        continue;
-                    }
-                };
-                next_marker = entry.continuation();
+            let mut max_keys = max_keys.map(|mk| mk.get());
+            while let Some(entry) = response.next().await {
+                let entry = entry?;
                let prefix_iter = entry
                    .blobs
                    .prefixes()
@@ -361,19 +333,19 @@ impl RemoteStorage for AzureBlobStorage {
                        assert!(mk > 0);
                        mk -= 1;
                        if mk == 0 {
-                            yield Ok(res); // limit reached
-                            break 'outer;
+                            return Ok(res); // limit reached
                        }
                        max_keys = Some(mk);
                    }
                }
-                yield Ok(res);
-
-                // We are done here
-                if next_marker.is_none() {
-                    break;
-                }
            }
+
+            Ok(res)
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
        }
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -26,7 +26,7 @@ use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
-use futures::{stream::Stream, StreamExt};
+use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -160,18 +160,13 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`].
-    ///
-    /// The stream is guaranteed to return at least one element, even in the case of errors
-    /// (in that case it's an `Err()`), or an empty `Listing`.
-    ///
-    /// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error.
-    /// The `next` function can be retried, and maybe in a future retry, there will be success.
+    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
+    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
    ///
    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
    /// from the absolute root of the bucket.
    ///
-    /// `mode` configures whether to use a delimiter.  Without a delimiter, all keys
+    /// `mode` configures whether to use a delimiter.  Without a delimiter, all keys
    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
    /// returned in `keys` ().
@@ -180,32 +175,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
    /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
    ///
-    /// [`ListObjectsV2`]: <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>
-    /// [`is_permanent`]: DownloadError::is_permanent
-    fn list_streaming(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>>;
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-        mode: ListingMode,
+        _mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> Result<Listing, DownloadError> {
-        let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
-        let mut combined = stream.next().await.expect("At least one item required")?;
-        while let Some(list) = stream.next().await {
-            let list = list?;
-            combined.keys.extend_from_slice(&list.keys);
-            combined.prefixes.extend_from_slice(&list.prefixes);
-        }
-        Ok(combined)
-    }
+    ) -> Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    ///
@@ -312,8 +288,8 @@ impl Debug for Download {

 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
-// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
 #[derive(Clone)]
+// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
 pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
@@ -322,14 +298,13 @@ pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
 }

 impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
-    // See [`RemoteStorage::list`].
    pub async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> Result<Listing, DownloadError> {
+    ) -> anyhow::Result<Listing, DownloadError> {
        match self {
            Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
            Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
@@ -338,23 +313,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    // See [`RemoteStorage::list_streaming`].
-    pub fn list_streaming<'a>(
-        &'a self,
-        prefix: Option<&'a RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
-        match self {
-            Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
-                as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
-            Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
-            Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
-            Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
-        }
-    }
-
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -485,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
 }

 impl GenericRemoteStorage {
-    pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
+    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        let timeout = storage_config.timeout;
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs { local_path: path } => {
@@ -500,7 +458,7 @@ impl GenericRemoteStorage {
                    std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
            }
            RemoteStorageKind::AzureContainer(azure_config) => {
                let storage_account = azure_config
@@ -546,16 +504,6 @@ impl GenericRemoteStorage {
            None => self.download(from, cancel).await,
        }
    }
-
-    /// The name of the bucket/container/etc.
-    pub fn bucket_name(&self) -> Option<&str> {
-        match self {
-            Self::LocalFs(_s) => None,
-            Self::AwsS3(s) => Some(s.bucket_name()),
-            Self::AzureBlob(s) => Some(s.container_name()),
-            Self::Unreliable(_s) => None,
-        }
-    }
 }

 /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -331,17 +331,6 @@ impl LocalFs {
 }

 impl RemoteStorage for LocalFs {
-    fn list_streaming(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
-        let listing = self.list(prefix, mode, max_keys, cancel);
-        futures::stream::once(listing)
-    }
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -16,10 +16,16 @@ use std::{

 use anyhow::{anyhow, Context as _};
 use aws_config::{
-    default_provider::credentials::DefaultCredentialsChain,
+    environment::credentials::EnvironmentVariableCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    profile::ProfileFileCredentialsProvider,
+    provider_config::ProviderConfig,
    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
    BehaviorVersion,
 };
+use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
@@ -70,27 +76,40 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
+    pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            remote_storage_config.bucket_name
        );

-        let region = Region::new(remote_storage_config.bucket_region.clone());
-        let region_opt = Some(region.clone());
+        let region = Some(Region::new(remote_storage_config.bucket_region.clone()));

-        // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
-        // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
-        // Incomplete list of auth methods used by this:
-        // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
-        // * "AWS_PROFILE" / `aws sso login --profile <profile>`
-        // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
-        // * http (ECS/EKS) container credentials
-        // * imds v2
-        let credentials_provider = DefaultCredentialsChain::builder()
-            .region(region)
-            .build()
-            .await;
+        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
+
+        let credentials_provider = {
+            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+            CredentialsProviderChain::first_try(
+                "env",
+                EnvironmentVariableCredentialsProvider::new(),
+            )
+            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
+            .or_else(
+                "profile-sso",
+                ProfileFileCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else(
+                "token",
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses imds v2
+            .or_else("imds", ImdsCredentialsProvider::builder().build())
+        };

        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
@@ -99,9 +118,9 @@ impl S3Bucket {
            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
            BehaviorVersion::v2023_11_09(),
        )
-        .region(region_opt)
+        .region(region)
        .identity_cache(IdentityCache::lazy().build())
-        .credentials_provider(credentials_provider)
+        .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
        .sleep_impl(SharedAsyncSleep::from(sleep_impl));

        let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
@@ -386,10 +405,6 @@ impl S3Bucket {
        }
        Ok(())
    }
-
-    pub fn bucket_name(&self) -> &str {
-        &self.bucket_name
-    }
 }

 pin_project_lite::pin_project! {
@@ -467,16 +482,17 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
 }

 impl RemoteStorage for S3Bucket {
-    fn list_streaming(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
        // s3 sdk wants i32
        let mut max_keys = max_keys.map(|mk| mk.get() as i32);
+        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -488,99 +504,89 @@ impl RemoteStorage for S3Bucket {
                })
            });

-        async_stream::stream! {
-            let _permit = self.permit(kind, cancel).await?;
+        let _permit = self.permit(kind, cancel).await?;

-            let mut continuation_token = None;
-            'outer: loop {
-                let started_at = start_measuring_requests(kind);
+        let mut continuation_token = None;

-                // min of two Options, returning Some if one is value and another is
-                // None (None is smaller than anything, so plain min doesn't work).
-                let request_max_keys = self
-                    .max_keys_per_list_response
-                    .into_iter()
-                    .chain(max_keys.into_iter())
-                    .min();
-                let mut request = self
-                    .client
-                    .list_objects_v2()
-                    .bucket(self.bucket_name.clone())
-                    .set_prefix(list_prefix.clone())
-                    .set_continuation_token(continuation_token.clone())
-                    .set_max_keys(request_max_keys);
+        loop {
+            let started_at = start_measuring_requests(kind);

-                if let ListingMode::WithDelimiter = mode {
-                    request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-                }
+            // min of two Options, returning Some if one is value and another is
+            // None (None is smaller than anything, so plain min doesn't work).
+            let request_max_keys = self
+                .max_keys_per_list_response
+                .into_iter()
+                .chain(max_keys.into_iter())
+                .min();
+            let mut request = self
+                .client
+                .list_objects_v2()
+                .bucket(self.bucket_name.clone())
+                .set_prefix(list_prefix.clone())
+                .set_continuation_token(continuation_token)
+                .set_max_keys(request_max_keys);

-                let request = request.send();
-
-                let response = tokio::select! {
-                    res = request => Ok(res),
-                    _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
-                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
-                }?;
-
-                let response = response
-                    .context("Failed to list S3 prefixes")
-                    .map_err(DownloadError::Other);
-
-                let started_at = ScopeGuard::into_inner(started_at);
-
-                crate::metrics::BUCKET_METRICS
-                    .req_seconds
-                    .observe_elapsed(kind, &response, started_at);
-
-                let response = match response {
-                    Ok(response) => response,
-                    Err(e) => {
-                        // The error is potentially retryable, so we must rewind the loop after yielding.
-                        yield Err(e);
-                        continue 'outer;
-                    },
-                };
-
-                let keys = response.contents();
-                let prefixes = response.common_prefixes.as_deref().unwrap_or_default();
-
-                tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
-                let mut result = Listing::default();
-
-                for object in keys {
-                    let object_path = object.key().expect("response does not contain a key");
-                    let remote_path = self.s3_object_to_relative_path(object_path);
-                    result.keys.push(remote_path);
-                    if let Some(mut mk) = max_keys {
-                        assert!(mk > 0);
-                        mk -= 1;
-                        if mk == 0 {
-                            // limit reached
-                            yield Ok(result);
-                            break 'outer;
-                        }
-                        max_keys = Some(mk);
-                    }
-                }
-
-                // S3 gives us prefixes like "foo/", we return them like "foo"
-                result.prefixes.extend(prefixes.iter().filter_map(|o| {
-                    Some(
-                        self.s3_object_to_relative_path(
-                            o.prefix()?
-                                .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
-                        ),
-                    )
-                }));
-
-                yield Ok(result);
-
-                continuation_token = match response.next_continuation_token {
-                    Some(new_token) => Some(new_token),
-                    None => break,
-                };
+            if let ListingMode::WithDelimiter = mode {
+                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
            }
+
+            let request = request.send();
+
+            let response = tokio::select! {
+                res = request => res,
+                _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
+                _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
+            };
+
+            let response = response
+                .context("Failed to list S3 prefixes")
+                .map_err(DownloadError::Other);
+
+            let started_at = ScopeGuard::into_inner(started_at);
+
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &response, started_at);
+
+            let response = response?;
+
+            let keys = response.contents();
+            let empty = Vec::new();
+            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
+
+            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+
+            for object in keys {
+                let object_path = object.key().expect("response does not contain a key");
+                let remote_path = self.s3_object_to_relative_path(object_path);
+                result.keys.push(remote_path);
+                if let Some(mut mk) = max_keys {
+                    assert!(mk > 0);
+                    mk -= 1;
+                    if mk == 0 {
+                        return Ok(result); // limit reached
+                    }
+                    max_keys = Some(mk);
+                }
+            }
+
+            // S3 gives us prefixes like "foo/", we return them like "foo"
+            result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                Some(
+                    self.s3_object_to_relative_path(
+                        o.prefix()?
+                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
+                    ),
+                )
+            }));
+
+            continuation_token = match response.next_continuation_token {
+                Some(new_token) => Some(new_token),
+                None => break,
+            };
        }
+
+        Ok(result)
    }

    async fn upload(
@@ -1035,8 +1041,8 @@ mod tests {

    use crate::{RemotePath, S3Bucket, S3Config};

-    #[tokio::test]
-    async fn relative_path() {
+    #[test]
+    fn relative_path() {
        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
@@ -1079,9 +1085,8 @@ mod tests {
                max_keys_per_list_response: Some(5),
                upload_storage_class: None,
            };
-            let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
-                .await
-                .expect("remote storage init");
+            let storage =
+                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
                let result = storage.relative_path_to_s3_object(test_path);
                let expected = expected_outputs[prefix_idx][test_path_idx];
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,7 +3,6 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
-use futures::StreamExt;
 use std::collections::HashMap;
 use std::num::NonZeroU32;
 use std::sync::Mutex;
@@ -108,23 +107,6 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;

 impl RemoteStorage for UnreliableWrapper {
-    fn list_streaming(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
-        async_stream::stream! {
-            self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-                .map_err(DownloadError::Other)?;
-            let mut stream = self.inner
-                .list_streaming(prefix, mode, max_keys, cancel);
-            while let Some(item) = stream.next().await {
-                yield item;
-            }
-        }
-    }
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data(
    let mut upload_tasks = JoinSet::new();
    let cancel = CancellationToken::new();

-    for i in 1..=upload_tasks_count {
+    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        let cancel = cancel.clone();

--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use camino::Utf8Path;
-use futures::StreamExt;
 use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
@@ -30,10 +29,10 @@ use super::{
 /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
 /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
 ///
-/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response.
-/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
-/// as the current default AWS S3 pagination limit is 1000.
-/// (see <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>).
+/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// since the current default AWS S3 pagination limit is 1000.
+/// (see <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>)
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
@@ -88,41 +87,6 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );

-    // list_streaming
-
-    let prefix_with_slash = base_prefix.add_trailing_slash();
-    let mut nested_remote_prefixes_st = test_client.list_streaming(
-        Some(&prefix_with_slash),
-        ListingMode::WithDelimiter,
-        None,
-        &cancel,
-    );
-    let mut nested_remote_prefixes_combined = HashSet::new();
-    let mut segments = 0;
-    let mut segment_max_size = 0;
-    while let Some(st) = nested_remote_prefixes_st.next().await {
-        let st = st?;
-        segment_max_size = segment_max_size.max(st.prefixes.len());
-        nested_remote_prefixes_combined.extend(st.prefixes.into_iter());
-        segments += 1;
-    }
-    assert!(segments > 1, "less than 2 segments: {segments}");
-    assert!(
-        segment_max_size * 2 <= nested_remote_prefixes_combined.len(),
-        "double of segment_max_size={segment_max_size} larger number of remote prefixes of {}",
-        nested_remote_prefixes_combined.len()
-    );
-    let remote_only_prefixes = nested_remote_prefixes_combined
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes_combined)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
    Ok(())
 }

--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -31,7 +31,6 @@ struct EnabledAzure {
 impl EnabledAzure {
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let client = create_azure_client(max_keys_in_list_response)
-            .await
            .context("Azure client creation")
            .expect("Azure client creation failed");

@@ -188,7 +187,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    }
 }

-async fn create_azure_client(
+fn create_azure_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    use rand::Rng;
@@ -222,8 +221,6 @@ async fn create_azure_client(
        timeout: Duration::from_secs(120),
    };
    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config)
-            .await
-            .context("remote storage init")?,
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -197,7 +197,6 @@ struct EnabledS3 {
 impl EnabledS3 {
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let client = create_s3_client(max_keys_in_list_response)
-            .await
            .context("S3 client creation")
            .expect("S3 client creation failed");

@@ -353,7 +352,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    }
 }

-async fn create_s3_client(
+fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    use rand::Rng;
@@ -386,9 +385,7 @@ async fn create_s3_client(
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config)
-            .await
-            .context("remote storage init")?,
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -20,6 +20,7 @@ bincode.workspace = true
 bytes.workspace = true
 camino.workspace = true
 chrono.workspace = true
+heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
 hyper = { workspace = true, features = ["full"] }
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -33,10 +33,6 @@ pub enum Scope {
    GenerationsApi,
    // Allows access to control plane managment API and some storage controller endpoints.
    Admin,
-
-    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
-    /// of a tenant & post scrub results.
-    Scrubber,
 }

 /// JWT payload. See docs/authentication.md for the format
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -1,114 +0,0 @@
-use std::{
-    fmt::Display,
-    time::{Duration, Instant},
-};
-
-use metrics::IntCounter;
-
-/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
-/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
-/// to mitigate the log spam from repeated failures.
-pub struct CircuitBreaker {
-    /// An identifier that enables us to log useful errors when a circuit is broken
-    name: String,
-
-    /// Consecutive failures since last success
-    fail_count: usize,
-
-    /// How many consecutive failures before we break the circuit
-    fail_threshold: usize,
-
-    /// If circuit is broken, when was it broken?
-    broken_at: Option<Instant>,
-
-    /// If set, we will auto-reset the circuit this long after it was broken.  If None, broken
-    /// circuits stay broken forever, or until success() is called.
-    reset_period: Option<Duration>,
-
-    /// If this is true, no actual circuit-breaking happens.  This is for overriding a circuit breaker
-    /// to permit something to keep running even if it would otherwise have tripped it.
-    short_circuit: bool,
-}
-
-impl CircuitBreaker {
-    pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
-        Self {
-            name,
-            fail_count: 0,
-            fail_threshold,
-            broken_at: None,
-            reset_period,
-            short_circuit: false,
-        }
-    }
-
-    /// Construct an unbreakable circuit breaker, for use in unit tests etc.
-    pub fn short_circuit() -> Self {
-        Self {
-            name: String::new(),
-            fail_threshold: 0,
-            fail_count: 0,
-            broken_at: None,
-            reset_period: None,
-            short_circuit: true,
-        }
-    }
-
-    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
-    where
-        E: Display,
-    {
-        if self.short_circuit {
-            return;
-        }
-
-        self.fail_count += 1;
-        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
-            self.break_circuit(metric, error);
-        }
-    }
-
-    /// Call this after successfully executing an operation
-    pub fn success(&mut self, metric: &IntCounter) {
-        self.fail_count = 0;
-        if let Some(broken_at) = &self.broken_at {
-            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
-                humantime::format_duration(broken_at.elapsed()));
-            self.broken_at = None;
-            metric.inc();
-        }
-    }
-
-    /// Call this before attempting an operation, and skip the operation if we are currently broken.
-    pub fn is_broken(&mut self) -> bool {
-        if self.short_circuit {
-            return false;
-        }
-
-        if let Some(broken_at) = self.broken_at {
-            match self.reset_period {
-                Some(reset_period) if broken_at.elapsed() > reset_period => {
-                    self.reset_circuit();
-                    false
-                }
-                _ => true,
-            }
-        } else {
-            false
-        }
-    }
-
-    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
-    where
-        E: Display,
-    {
-        self.broken_at = Some(Instant::now());
-        tracing::error!(breaker=%self.name, "Circuit breaker broken!  Last error: {error}");
-        metric.inc();
-    }
-
-    fn reset_circuit(&mut self) {
-        self.broken_at = None;
-        self.fail_count = 0;
-    }
-}
--- a/libs/utils/src/history_buffer.rs
+++ b/libs/utils/src/history_buffer.rs
@@ -0,0 +1,196 @@
+//! A fixed-capacity (heapless) history buffer of recent events that also counts dropped entries.
+
+use std::ops;
+
+use heapless::HistoryBuffer;
+
+#[derive(Debug, Clone)]
+pub struct HistoryBufferWithDropCounter<T, const L: usize> {
+    buffer: HistoryBuffer<T, L>,
+    drop_count: u64,
+}
+
+impl<T, const L: usize> HistoryBufferWithDropCounter<T, L> {
+    /// Appends `data`, counting a drop whenever the write does not grow the buffer.
+    pub fn write(&mut self, data: T) {
+        let before = self.buffer.len();
+        self.buffer.write(data);
+        self.drop_count += u64::from(self.buffer.len() == before);
+    }
+    /// Number of writes so far that did not increase the buffer length.
+    pub fn drop_count(&self) -> u64 {
+        self.drop_count
+    }
+    /// Builds a new buffer by applying `f` to each element, oldest first; keeps the drop count.
+    pub fn map<U, F: Fn(&T) -> U>(&self, f: F) -> HistoryBufferWithDropCounter<U, L> {
+        let mut mapped = HistoryBufferWithDropCounter::<U, L>::default();
+        mapped.buffer.extend(self.buffer.oldest_ordered().map(f));
+        mapped.drop_count = self.drop_count;
+        mapped
+    }
+}
+
+impl<T, const L: usize> Default for HistoryBufferWithDropCounter<T, L> {
+    fn default() -> Self {
+        Self {
+            buffer: HistoryBuffer::default(),
+            drop_count: 0,
+        }
+    }
+}
+
+impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
+    type Target = HistoryBuffer<T, L>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.buffer
+    }
+}
+
+#[derive(serde::Serialize, serde::Deserialize)]
+struct SerdeRepr<T> {
+    buffer: Vec<T>,
+    buffer_size: usize,
+    drop_count: u64,
+}
+
+impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter<T, L>> for SerdeRepr<T>
+where
+    T: Clone + serde::Serialize,
+{
+    fn from(value: &'a HistoryBufferWithDropCounter<T, L>) -> Self {
+        SerdeRepr {
+            // Emit oldest-first: `iter()` walks the backing slice in storage order.
+            buffer: value.buffer.oldest_ordered().cloned().collect(),
+            buffer_size: L,
+            drop_count: value.drop_count,
+        }
+    }
+}
+
+impl<T, const L: usize> serde::Serialize for HistoryBufferWithDropCounter<T, L>
+where
+    T: Clone + serde::Serialize,
+{
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        SerdeRepr::from(self).serialize(serializer)
+    }
+}
+
+impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
+where
+    T: Clone + serde::Deserialize<'de>,
+{
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let SerdeRepr {
+            buffer: des_buffer,
+            drop_count,
+            buffer_size,
+        } = SerdeRepr::<T>::deserialize(deserializer)?;
+        if buffer_size != L {
+            use serde::de::Error;
+            return Err(D::Error::custom(format!(
+                "invalid buffer_size, expecting {L} got {buffer_size}"
+            )));
+        }
+        let mut buffer = HistoryBuffer::new();
+        buffer.extend(des_buffer);
+        Ok(HistoryBufferWithDropCounter { buffer, drop_count })
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::HistoryBufferWithDropCounter;
+
+    #[test]
+    fn test_basics() {
+        let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
+        b.write(1);
+        b.write(2);
+        b.write(3);
+        assert!(b.iter().any(|e| *e == 2));
+        assert!(b.iter().any(|e| *e == 3));
+        assert!(!b.iter().any(|e| *e == 1));
+
+        // round-trip serde: the oldest-to-newest order must survive a wrapped buffer
+        let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
+            serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
+        assert_eq!(
+            round_tripped.oldest_ordered().cloned().collect::<Vec<_>>(),
+            vec![2, 3]
+        );
+    }
+
+    #[test]
+    fn test_drop_count_works() {
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+        b.write(1);
+        assert_eq!(b.drop_count(), 0);
+        b.write(2);
+        assert_eq!(b.drop_count(), 0);
+        b.write(3);
+        assert_eq!(b.drop_count(), 1);
+        b.write(4);
+        assert_eq!(b.drop_count(), 2);
+    }
+
+    #[test]
+    fn test_clone_works() {
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+        b.write(1);
+        b.write(2);
+        b.write(3);
+        assert_eq!(b.drop_count(), 1);
+        let mut c = b.clone();
+        assert_eq!(c.drop_count(), 1);
+        assert!(c.iter().any(|e| *e == 2));
+        assert!(c.iter().any(|e| *e == 3));
+        assert!(!c.iter().any(|e| *e == 1));
+
+        c.write(4);
+        assert!(c.iter().any(|e| *e == 4));
+        assert!(!b.iter().any(|e| *e == 4));
+    }
+
+    #[test]
+    fn test_map() {
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+
+        b.write(1);
+        assert_eq!(b.drop_count(), 0);
+        {
+            let c = b.map(|i| i + 10);
+            assert_eq!(c.oldest_ordered().cloned().collect::<Vec<_>>(), vec![11]);
+            assert_eq!(c.drop_count(), 0);
+        }
+
+        b.write(2);
+        assert_eq!(b.drop_count(), 0);
+        {
+            let c = b.map(|i| i + 10);
+            assert_eq!(
+                c.oldest_ordered().cloned().collect::<Vec<_>>(),
+                vec![11, 12]
+            );
+            assert_eq!(c.drop_count(), 0);
+        }
+
+        b.write(3);
+        assert_eq!(b.drop_count(), 1);
+        {
+            let c = b.map(|i| i + 10);
+            assert_eq!(
+                c.oldest_ordered().cloned().collect::<Vec<_>>(),
+                vec![12, 13]
+            );
+            assert_eq!(c.drop_count(), 1);
+        }
+    }
+}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -52,17 +52,17 @@ struct RequestId(String);
 /// There could be other ways to implement similar functionality:
 ///
 /// * procmacros placed on top of all handler methods
-///   With all the drawbacks of procmacros, brings no difference implementation-wise,
-///   and little code reduction compared to the existing approach.
+/// With all the drawbacks of procmacros, brings no difference implementation-wise,
+/// and little code reduction compared to the existing approach.
 ///
 /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
-///   implemented for [`RouterBuilder`].
-///   Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
+/// implemented for [`RouterBuilder`].
+/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
 ///
 /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
-///   later, in a post-response middleware.
-///   Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
-///   tries to achive with its `.instrument` used in the current approach.
+/// later, in a post-response middleware.
+/// Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
+/// tries to achieve with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
 pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -74,15 +74,6 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
        .transpose()
 }

-pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
-    request: &Request<Body>,
-    param_name: &str,
-) -> Result<T, ApiError> {
-    parse_query_param(request, param_name)?.ok_or_else(|| {
-        ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
-    })
-}
-
 pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
    match request.body_mut().data().await {
        Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -302,6 +302,17 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

+/// Neon Connection Id identifies long-lived connections (for example, a pagestream
+/// connection with the page_service). It is used for better logging and tracing.
+///
+/// NOTE: It (de)serializes as an array of bytes, so the string representation would look
+/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
+/// See [`Id`] for alternative ways to serialize it.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+pub struct ConnectionId(Id);
+
+id_newtype!(ConnectionId);
+
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -26,8 +26,6 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;

-pub mod shard;
-
 mod hex;
 pub use hex::Hex;

@@ -59,6 +57,8 @@ pub mod signals;

 pub mod fs_ext;

+pub mod history_buffer;
+
 pub mod measured_stream;

 pub mod serde_percent;
@@ -96,8 +96,6 @@ pub mod poison;

 pub mod toml_edit_ext;

-pub mod circuit_breaker;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -1,452 +0,0 @@
-//! See `pageserver_api::shard` for description on sharding.
-
-use std::{ops::RangeInclusive, str::FromStr};
-
-use hex::FromHex;
-use serde::{Deserialize, Serialize};
-
-use crate::id::TenantId;
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardNumber(pub u8);
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(pub u8);
-
-/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-/// Formatting helper, for generating the `shard_id` label in traces.
-pub struct ShardSlug<'a>(&'a TenantShardId);
-
-/// TenantShardId globally identifies a particular shard in a particular tenant.
-///
-/// These are written as `<TenantId>-<ShardSlug>`, for example:
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
-/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
-/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
-///
-/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible with TenantId: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-impl ShardCount {
-    pub const MAX: Self = Self(u8::MAX);
-    pub const MIN: Self = Self(0);
-
-    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
-    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
-    ///
-    /// This method returns the actual number of shards, i.e. if our internal value is
-    /// zero, we return 1 (unsharded tenants have 1 shard).
-    pub fn count(&self) -> u8 {
-        if self.0 > 0 {
-            self.0
-        } else {
-            1
-        }
-    }
-
-    /// The literal internal value: this is **not** the number of shards in the
-    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
-    /// [`Self::count`] if you want to know the cardinality of shards.
-    pub fn literal(&self) -> u8 {
-        self.0
-    }
-
-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
-    pub fn is_unsharded(&self) -> bool {
-        self.0 == 0
-    }
-
-    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
-    /// [`Self::literal`] would return.
-    pub const fn new(val: u8) -> Self {
-        Self(val)
-    }
-}
-
-impl ShardNumber {
-    pub const MAX: Self = Self(u8::MAX);
-}
-
-impl TenantShardId {
-    pub fn unsharded(tenant_id: TenantId) -> Self {
-        Self {
-            tenant_id,
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
-    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
-    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
-        RangeInclusive::new(
-            Self {
-                tenant_id,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            },
-            Self {
-                tenant_id,
-                shard_number: ShardNumber::MAX,
-                shard_count: ShardCount::MAX,
-            },
-        )
-    }
-
-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-
-    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
-        self.shard_number == ShardNumber(0)
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
-    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
-    pub fn to_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_number: self.shard_number,
-            shard_count: self.shard_count,
-        }
-    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
-}
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
-    }
-}
-
-impl std::fmt::Display for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
-        } else {
-            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
-            // is distinct from the normal single shard case (shard count == 1).
-            self.tenant_id.fmt(f)
-        }
-    }
-}
-
-impl std::fmt::Debug for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for TenantShardId {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
-        if s.len() == 32 {
-            // Legacy case: no shard specified
-            Ok(Self {
-                tenant_id: TenantId::from_str(s)?,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            })
-        } else if s.len() == 37 {
-            let bytes = s.as_bytes();
-            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
-            Ok(Self {
-                tenant_id,
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 18]> for TenantShardId {
-    fn from(b: [u8; 18]) -> Self {
-        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
-
-        Self {
-            tenant_id: TenantId::from(tenant_id_bytes),
-            shard_number: ShardNumber(b[16]),
-            shard_count: ShardCount(b[17]),
-        }
-    }
-}
-
-impl ShardIndex {
-    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            shard_number: number,
-            shard_count: count,
-        }
-    }
-    pub fn unsharded() -> Self {
-        Self {
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
-
-    /// For use in constructing remote storage paths: concatenate this with a TenantId
-    /// to get a fully qualified TenantShardId.
-    ///
-    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
-    /// that the legacy pre-sharding remote key format is preserved.
-    pub fn get_suffix(&self) -> String {
-        if self.is_unsharded() {
-            "".to_string()
-        } else {
-            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-        }
-    }
-}
-
-impl std::fmt::Display for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-    }
-}
-
-impl std::fmt::Debug for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for ShardIndex {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 1 byte shard number, 1 byte shard count
-        if s.len() == 4 {
-            let bytes = s.as_bytes();
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(bytes, &mut shard_parts)?;
-            Ok(Self {
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 2]> for ShardIndex {
-    fn from(b: [u8; 2]) -> Self {
-        Self {
-            shard_number: ShardNumber(b[0]),
-            shard_count: ShardCount(b[1]),
-        }
-    }
-}
-
-impl Serialize for TenantShardId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Note: while human encoding of [`TenantShardId`] is backward and forward
-            // compatible, this binary encoding is not.
-            let mut packed: [u8; 18] = [0; 18];
-            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
-            packed[16] = self.shard_number.0;
-            packed[17] = self.shard_count.0;
-
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for TenantShardId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = TenantShardId;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 18])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 18] = Deserialize::deserialize(s)?;
-                Ok(TenantShardId::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                TenantShardId::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                18,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
-impl Serialize for ShardIndex {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Binary encoding is not used in index_part.json, but is included in anticipation of
-            // switching various structures (e.g. inter-process communication, remote metadata) to more
-            // compact binary encodings in future.
-            let mut packed: [u8; 2] = [0; 2];
-            packed[0] = self.shard_number.0;
-            packed[1] = self.shard_count.0;
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for ShardIndex {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = ShardIndex;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 2])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 2] = Deserialize::deserialize(s)?;
-                Ok(ShardIndex::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                ShardIndex::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                2,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -62,7 +62,6 @@ sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
-tikv-jemallocator.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
 tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 pageserver_api.workspace = true
 thiserror.workspace = true
 async-trait.workspace = true
-reqwest = { workspace = true, features = [ "stream" ] }
+reqwest.workspace = true
 utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,7 +1,6 @@
 use std::collections::HashMap;

 use bytes::Bytes;
-use detach_ancestor::AncestorDetached;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
@@ -10,8 +9,6 @@ use utils::{
    lsn::Lsn,
 };

-pub use reqwest::Body as ReqwestBody;
-
 pub mod util;

 #[derive(Debug, Clone)]
@@ -23,9 +20,6 @@ pub struct Client {

 #[derive(thiserror::Error, Debug)]
 pub enum Error {
-    #[error("send request: {0}")]
-    SendRequest(reqwest::Error),
-
    #[error("receive body: {0}")]
    ReceiveBody(reqwest::Error),

@@ -179,30 +173,19 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }

-    fn start_request<U: reqwest::IntoUrl>(
-        &self,
-        method: Method,
-        uri: U,
-    ) -> reqwest::RequestBuilder {
-        let req = self.client.request(method, uri);
-        if let Some(value) = &self.authorization_header {
-            req.header(reqwest::header::AUTHORIZATION, value)
-        } else {
-            req
-        }
-    }
-
    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
        body: B,
    ) -> Result<reqwest::Response> {
-        self.start_request(method, uri)
-            .json(&body)
-            .send()
-            .await
-            .map_err(Error::ReceiveBody)
+        let req = self.client.request(method, uri);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+        req.json(&body).send().await.map_err(Error::ReceiveBody)
    }

    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -419,23 +402,6 @@ impl Client {
        }
    }

-    pub async fn timeline_detach_ancestor(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-    ) -> Result<AncestorDetached> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
-            self.mgmt_api_endpoint
-        );
-
-        self.request(Method::PUT, &uri, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
@@ -643,53 +609,4 @@ impl Client {
            }),
        }
    }
-
-    pub async fn import_basebackup(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        base_lsn: Lsn,
-        end_lsn: Lsn,
-        pg_version: u32,
-        basebackup_tarball: ReqwestBody,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
-            self.mgmt_api_endpoint,
-        );
-        self.start_request(Method::PUT, uri)
-            .body(basebackup_tarball)
-            .send()
-            .await
-            .map_err(Error::SendRequest)?
-            .error_from_body()
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn import_wal(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        start_lsn: Lsn,
-        end_lsn: Lsn,
-        wal_tarball: ReqwestBody,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
-            self.mgmt_api_endpoint,
-        );
-        self.start_request(Method::PUT, uri)
-            .body(wal_tarball)
-            .send()
-            .await
-            .map_err(Error::SendRequest)?
-            .error_from_body()
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
 }
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -131,7 +131,7 @@ impl CompactionKey for Key {
 pub type CompactionKeySpace<K> = Vec<Range<K>>;

 /// Functions needed from all layers.
-pub trait CompactionLayer<K: CompactionKey> {
+pub trait CompactionLayer<K: CompactionKey + ?Sized> {
    fn key_range(&self) -> &Range<K>;
    fn lsn_range(&self) -> &Range<Lsn>;

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
                .get("remote_storage")
                .expect("need remote_storage");
            let config = RemoteStorageConfig::from_toml(toml_item)?;
-            let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
+            let storage = remote_storage::GenericRemoteStorage::from_config(&config);
            let cancel = CancellationToken::new();
            storage
                .unwrap()
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,14 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
-            Err(AuthError(
-                format!(
-                    "JWT scope '{:?}' is ineligible for Pageserver auth",
-                    claims.scope
-                )
-                .into(),
-            ))
-        }
+        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Pageserver auth",
+                claims.scope
+            )
+            .into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,35 +2,30 @@

 //! Main entry point for the Page Server executable.

-use std::env;
 use std::env::{var, VarError};
 use std::io::Read;
 use std::sync::Arc;
 use std::time::Duration;
+use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
 use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
-use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
-};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
-use tokio_util::sync::CancellationToken;
 use tracing::*;

 use metrics::set_build_info_metric;
 use pageserver::{
-    config::PageServerConf,
+    config::{defaults::*, PageServerConf},
    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
@@ -52,9 +47,6 @@ use utils::{
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-#[global_allocator]
-static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
-
 const PID_FILE_NAME: &str = "pageserver.pid";

 const FEATURES: &[&str] = &[
@@ -89,13 +81,18 @@ fn main() -> anyhow::Result<()> {
        .with_context(|| format!("Error opening workdir '{workdir}'"))?;

    let cfg_file_path = workdir.join("pageserver.toml");
-    let identity_file_path = workdir.join("identity.toml");

    // Set CWD to workdir for non-daemon modes
    env::set_current_dir(&workdir)
        .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;

-    let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
+    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
+        ControlFlow::Continue(conf) => conf,
+        ControlFlow::Break(()) => {
+            info!("Pageserver config init successful");
+            return Ok(());
+        }
+    };

    // Initialize logging.
    //
@@ -150,55 +147,70 @@ fn main() -> anyhow::Result<()> {
 }

 fn initialize_config(
-    identity_file_path: &Utf8Path,
    cfg_file_path: &Utf8Path,
+    arg_matches: clap::ArgMatches,
    workdir: &Utf8Path,
-) -> anyhow::Result<&'static PageServerConf> {
-    // The deployment orchestrator writes out an indentity file containing the node id
-    // for all pageservers. This file is the source of truth for the node id. In order
-    // to allow for rolling back pageserver releases, the node id is also included in
-    // the pageserver config that the deployment orchestrator writes to disk for the pageserver.
-    // A rolled back version of the pageserver will get the node id from the pageserver.toml
-    // config file.
-    let identity = match std::fs::File::open(identity_file_path) {
+) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
+    let init = arg_matches.get_flag("init");
+
+    let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
        Ok(mut f) => {
-            let md = f.metadata().context("stat config file")?;
-            if !md.is_file() {
-                anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
+            if init {
+                anyhow::bail!("config file already exists: {cfg_file_path}");
            }
-
-            let mut s = String::new();
-            f.read_to_string(&mut s).context("read identity file")?;
-            toml_edit::de::from_str::<PageserverIdentity>(&s)?
-        }
-        Err(e) => {
-            anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ...");
-        }
-    };
-
-    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
-        Ok(mut f) => {
            let md = f.metadata().context("stat config file")?;
            if md.is_file() {
                let mut s = String::new();
                f.read_to_string(&mut s).context("read config file")?;
-                s.parse().context("parse config file toml")?
+                Some(s.parse().context("parse config file toml")?)
            } else {
                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
            }
        }
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
        Err(e) => {
            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
        }
    };

-    debug!("Using pageserver toml: {config}");
+    let mut effective_config = file_contents.unwrap_or_else(|| {
+        DEFAULT_CONFIG_FILE
+            .parse()
+            .expect("unit tests ensure this works")
+    });
+
+    // Patch with overrides from the command line
+    if let Some(values) = arg_matches.get_many::<String>("config-override") {
+        for option_line in values {
+            let doc = toml_edit::Document::from_str(option_line).with_context(|| {
+                format!("Option '{option_line}' could not be parsed as a toml document")
+            })?;
+
+            for (key, item) in doc.iter() {
+                effective_config.insert(key, item.clone());
+            }
+        }
+    }
+
+    debug!("Resulting toml: {effective_config}");

    // Construct the runtime representation
-    let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
+    let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
        .context("Failed to parse pageserver configuration")?;

-    Ok(Box::leak(Box::new(conf)))
+    if init {
+        info!("Writing pageserver config to '{cfg_file_path}'");
+
+        std::fs::write(cfg_file_path, effective_config.to_string())
+            .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
+        info!("Config successfully written to '{cfg_file_path}'")
+    }
+
+    Ok(if init {
+        ControlFlow::Break(())
+    } else {
+        ControlFlow::Continue(Box::leak(Box::new(conf)))
+    })
 }

 struct WaitForPhaseResult<F: std::future::Future + Unpin> {
@@ -290,7 +302,6 @@ fn start_pageserver(
    // Create and lock PID file. This ensures that there cannot be more than one
    // pageserver process running at the same time.
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
-    info!("Claiming pid file at {lock_file_path:?}...");
    let lock_file =
        utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
    info!("Claimed pid file at {lock_file_path:?}");
@@ -371,7 +382,7 @@ fn start_pageserver(
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    // Set up remote storage client
-    let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;
+    let remote_storage = create_remote_storage_client(conf)?;

    // Set up deletion queue
    let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -416,10 +427,8 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    let background_purges = mgr::BackgroundPurges::default();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
-        background_purges.clone(),
        TenantSharedResources {
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
@@ -511,7 +520,7 @@ fn start_pageserver(
        }
    });

-    let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks(
+    let secondary_controller = secondary::spawn_tasks(
        tenant_manager.clone(),
        remote_storage.clone(),
        background_jobs_barrier.clone(),
@@ -524,19 +533,18 @@ fn start_pageserver(
    // been configured.
    let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();

-    let disk_usage_eviction_task = launch_disk_usage_global_eviction_task(
+    launch_disk_usage_global_eviction_task(
        conf,
        remote_storage.clone(),
        disk_usage_eviction_state.clone(),
        tenant_manager.clone(),
        background_jobs_barrier.clone(),
-    );
+    )?;

    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
-    let http_endpoint_listener = {
-        let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
-        let cancel = CancellationToken::new();
+    {
+        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

        let router_state = Arc::new(
            http::routes::State::new(
@@ -557,44 +565,78 @@ fn start_pageserver(
        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
-            .with_graceful_shutdown({
-                let cancel = cancel.clone();
-                async move { cancel.clone().cancelled().await }
-            });
+            .with_graceful_shutdown(task_mgr::shutdown_watcher());

-        let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+        task_mgr::spawn(
+            MGMT_REQUEST_RUNTIME.handle(),
+            TaskKind::HttpEndpointListener,
+            None,
+            None,
            "http endpoint listener",
-            server,
-        ));
-        HttpEndpointListener(CancellableTask { task, cancel })
-    };
+            true,
+            async {
+                server.await?;
+                Ok(())
+            },
+        );
+    }

-    let consumption_metrics_tasks = {
-        let cancel = shutdown_pageserver.child_token();
-        let task = crate::BACKGROUND_RUNTIME.spawn({
-            let tenant_manager = tenant_manager.clone();
-            let cancel = cancel.clone();
-            async move {
-                // first wait until background jobs are cleared to launch.
-                //
-                // this is because we only process active tenants and timelines, and the
-                // Timeline::get_current_logical_size will spawn the logical size calculation,
-                // which will not be rate-limited.
-                tokio::select! {
-                    _ = cancel.cancelled() => { return; },
-                    _ = background_jobs_barrier.wait() => {}
-                };
+    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+        let metrics_ctx = RequestContext::todo_child(
+            TaskKind::MetricsCollection,
+            // This task itself shouldn't download anything.
+            // The actual size calculation does need downloads, and
+            // creates a child context with the right DownloadBehavior.
+            DownloadBehavior::Error,
+        );

-                pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await;
-            }
-        });
-        ConsumptionMetricsTasks(CancellableTask { task, cancel })
-    };
+        let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
+
+        task_mgr::spawn(
+            crate::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MetricsCollection,
+            None,
+            None,
+            "consumption metrics collection",
+            true,
+            {
+                let tenant_manager = tenant_manager.clone();
+                async move {
+                    // first wait until background jobs are cleared to launch.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    let cancel = task_mgr::shutdown_token();
+
+                    tokio::select! {
+                        _ = cancel.cancelled() => { return Ok(()); },
+                        _ = background_jobs_barrier.wait() => {}
+                    };
+
+                    pageserver::consumption_metrics::collect_metrics(
+                        tenant_manager,
+                        metric_collection_endpoint,
+                        &conf.metric_collection_bucket,
+                        conf.metric_collection_interval,
+                        conf.cached_metric_collection_interval,
+                        conf.synthetic_size_calculation_interval,
+                        conf.id,
+                        local_disk_storage,
+                        cancel,
+                        metrics_ctx,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                }
+            },
+        );
+    }

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let libpq_listener = {
-        let cancel = CancellationToken::new();
+    {
        let libpq_ctx = RequestContext::todo_child(
            TaskKind::LibpqEndpointListener,
            // listener task shouldn't need to download anything. (We will
@@ -603,20 +645,30 @@ fn start_pageserver(
            // accept connections.)
            DownloadBehavior::Error,
        );
-
-        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-            "libpq listener",
-            page_service::libpq_listener_main(
-                tenant_manager.clone(),
-                pg_auth,
-                pageserver_listener,
-                conf.pg_auth_type,
-                libpq_ctx,
-                cancel.clone(),
-            ),
-        ));
-        LibpqEndpointListener(CancellableTask { task, cancel })
-    };
+        task_mgr::spawn(
+            COMPUTE_REQUEST_RUNTIME.handle(),
+            TaskKind::LibpqEndpointListener,
+            None,
+            None,
+            "libpq endpoint listener",
+            true,
+            {
+                let tenant_manager = tenant_manager.clone();
+                async move {
+                    page_service::libpq_listener_main(
+                        tenant_manager,
+                        broker_client,
+                        pg_auth,
+                        pageserver_listener,
+                        conf.pg_auth_type,
+                        libpq_ctx,
+                        task_mgr::shutdown_token(),
+                    )
+                    .await
+                }
+            },
+        );
+    }

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -642,24 +694,13 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            pageserver::shutdown_pageserver(
-                http_endpoint_listener,
-                libpq_listener,
-                consumption_metrics_tasks,
-                disk_usage_eviction_task,
-                &tenant_manager,
-                background_purges,
-                deletion_queue.clone(),
-                secondary_controller_tasks,
-                0,
-            )
-            .await;
+            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
            unreachable!()
        })
    }
 }

-async fn create_remote_storage_client(
+fn create_remote_storage_client(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<GenericRemoteStorage> {
    let config = if let Some(config) = &conf.remote_storage_config {
@@ -669,7 +710,7 @@ async fn create_remote_storage_client(
    };

    // Create the client
-    let mut remote_storage = GenericRemoteStorage::from_config(config).await?;
+    let mut remote_storage = GenericRemoteStorage::from_config(config)?;

    // If `test_remote_failures` is non-zero, wrap the client with a
    // wrapper that simulates failures.
@@ -692,12 +733,28 @@ fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(version())
+        .arg(
+            Arg::new("init")
+                .long("init")
+                .action(ArgAction::SetTrue)
+                .help("Initialize pageserver with all given config overrides"),
+        )
        .arg(
            Arg::new("workdir")
                .short('D')
                .long("workdir")
                .help("Working directory for the pageserver"),
        )
+        // See `settings.md` for more details on the extra configuration parameters pageserver can process
+        .arg(
+            Arg::new("config-override")
+                .long("config-override")
+                .short('c')
+                .num_args(1)
+                .action(ArgAction::Append)
+                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
+                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
+        )
        .arg(
            Arg::new("enabled-features")
                .long("enabled-features")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -7,11 +7,12 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
 use remote_storage::{RemotePath, RemoteStorageConfig};
+use serde;
 use serde::de::IntoDeserializer;
-use serde::{self, Deserialize};
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
+use utils::id::ConnectionId;
 use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
@@ -68,6 +69,7 @@ pub mod defaults {
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -90,7 +92,7 @@ pub mod defaults {
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
-        ImageCompressionAlgorithm::Disabled;
+        ImageCompressionAlgorithm::DisabledNoDecompress;

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

@@ -122,6 +124,7 @@ pub mod defaults {
 #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'

 #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
+#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
@@ -236,6 +239,7 @@ pub struct PageServerConf {
    // How often to collect metrics and send them to the metrics endpoint.
    pub metric_collection_interval: Duration,
    // How often to send unchanged cached metrics to the metrics endpoint.
+    pub cached_metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<Url>,
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    pub synthetic_size_calculation_interval: Duration,
@@ -367,6 +371,7 @@ struct PageServerConfigBuilder {
    concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,

    metric_collection_interval: BuilderValue<Duration>,
+    cached_metric_collection_interval: BuilderValue<Duration>,
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,
    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
@@ -406,13 +411,6 @@ struct PageServerConfigBuilder {
 }

 impl PageServerConfigBuilder {
-    fn new(node_id: NodeId) -> Self {
-        let mut this = Self::default();
-        this.id(node_id);
-
-        this
-    }
-
    #[inline(always)]
    fn default_values() -> Self {
        use self::BuilderValue::*;
@@ -457,6 +455,10 @@ impl PageServerConfigBuilder {
                DEFAULT_METRIC_COLLECTION_INTERVAL,
            )
            .expect("cannot parse default metric collection interval")),
+            cached_metric_collection_interval: Set(humantime::parse_duration(
+                DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default cached_metric_collection_interval")),
            synthetic_size_calculation_interval: Set(humantime::parse_duration(
                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
            )
@@ -588,6 +590,14 @@ impl PageServerConfigBuilder {
        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
    }

+    pub fn cached_metric_collection_interval(
+        &mut self,
+        cached_metric_collection_interval: Duration,
+    ) {
+        self.cached_metric_collection_interval =
+            BuilderValue::Set(cached_metric_collection_interval)
+    }
+
    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
    }
@@ -721,6 +731,7 @@ impl PageServerConfigBuilder {
                broker_keepalive_interval,
                log_format,
                metric_collection_interval,
+                cached_metric_collection_interval,
                metric_collection_endpoint,
                metric_collection_bucket,
                synthetic_size_calculation_interval,
@@ -859,6 +870,22 @@ impl PageServerConf {
        )
    }

+    pub fn traces_path(&self) -> Utf8PathBuf {
+        self.workdir.join("traces")
+    }
+
+    pub fn trace_path(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        connection_id: &ConnectionId,
+    ) -> Utf8PathBuf {
+        self.traces_path()
+            .join(tenant_shard_id.to_string())
+            .join(timeline_id.to_string())
+            .join(connection_id.to_string())
+    }
+
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
@@ -888,12 +915,8 @@ impl PageServerConf {
    /// validating the input and failing on errors.
    ///
    /// This leaves any options not present in the file in the built-in defaults.
-    pub fn parse_and_validate(
-        node_id: NodeId,
-        toml: &Document,
-        workdir: &Utf8Path,
-    ) -> anyhow::Result<Self> {
-        let mut builder = PageServerConfigBuilder::new(node_id);
+    pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
+        let mut builder = PageServerConfigBuilder::default();
        builder.workdir(workdir.to_owned());

        let mut t_conf = TenantConfOpt::default();
@@ -924,8 +947,7 @@ impl PageServerConf {
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
-                "id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
-                            // Logging is not set up yet, so we can't do it.
+                "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
@@ -942,6 +964,7 @@ impl PageServerConf {
                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
                }),
                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
+                "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
                "metric_collection_endpoint" => {
                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
                    builder.metric_collection_endpoint(Some(endpoint));
@@ -1074,6 +1097,7 @@ impl PageServerConf {
            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
            ),
            metric_collection_interval: Duration::from_secs(60),
+            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
            metric_collection_bucket: None,
            synthetic_size_calculation_interval: Duration::from_secs(60),
@@ -1102,12 +1126,6 @@ impl PageServerConf {
    }
 }

-#[derive(Deserialize)]
-#[serde(deny_unknown_fields)]
-pub struct PageserverIdentity {
-    pub id: NodeId,
-}
-
 // Helper functions to parse a toml Item

 fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
@@ -1258,6 +1276,7 @@ initial_superuser_name = 'zzzz'
 id = 10

 metric_collection_interval = '222 s'
+cached_metric_collection_interval = '22200 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'

@@ -1277,7 +1296,7 @@ background_task_maximum_delay = '334 s'
        );
        let toml = config_string.parse()?;

-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
+        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
@@ -1313,6 +1332,9 @@ background_task_maximum_delay = '334 s'
                metric_collection_interval: humantime::parse_duration(
                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
                )?,
+                cached_metric_collection_interval: humantime::parse_duration(
+                    defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
+                )?,
                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: humantime::parse_duration(
@@ -1359,7 +1381,7 @@ background_task_maximum_delay = '334 s'
        );
        let toml = config_string.parse()?;

-        let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
+        let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
            .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));

        assert_eq!(
@@ -1391,6 +1413,7 @@ background_task_maximum_delay = '334 s'
                eviction_task_immitated_concurrent_logical_size_queries:
                    ConfigurableSemaphore::default(),
                metric_collection_interval: Duration::from_secs(222),
+                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: Duration::from_secs(333),
@@ -1449,13 +1472,12 @@ broker_endpoint = '{broker_endpoint}'

            let toml = config_string.parse()?;

-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for the local FS");
+            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                })
+                .remote_storage_config
+                .expect("Should have remote storage config for the local FS");

            assert_eq!(
                parsed_remote_storage_config,
@@ -1511,13 +1533,12 @@ broker_endpoint = '{broker_endpoint}'

            let toml = config_string.parse()?;

-            let parsed_remote_storage_config =
-                PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
-                    .unwrap_or_else(|e| {
-                        panic!("Failed to parse config '{config_string}', reason: {e:?}")
-                    })
-                    .remote_storage_config
-                    .expect("Should have remote storage config for S3");
+            let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
+                .unwrap_or_else(|e| {
+                    panic!("Failed to parse config '{config_string}', reason: {e:?}")
+                })
+                .remote_storage_config
+                .expect("Should have remote storage config for S3");

            assert_eq!(
                parsed_remote_storage_config,
@@ -1539,6 +1560,34 @@ broker_endpoint = '{broker_endpoint}'
        Ok(())
    }

+    #[test]
+    fn parse_tenant_config() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+        let broker_endpoint = "http://127.0.0.1:7777";
+        let trace_read_requests = true;
+
+        let config_string = format!(
+            r#"{ALL_BASE_VALUES_TOML}
+pg_distrib_dir='{pg_distrib_dir}'
+broker_endpoint = '{broker_endpoint}'
+
+[tenant_config]
+trace_read_requests = {trace_read_requests}"#,
+        );
+
+        let toml = config_string.parse()?;
+
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
+        assert_eq!(
+            conf.default_tenant_conf.trace_read_requests, trace_read_requests,
+            "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
+        );
+
+        Ok(())
+    }
+
    #[test]
    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
        let config_string = r#"
@@ -1596,7 +1645,7 @@ threshold = "20m"
 "#,
        );
        let toml: Document = pageserver_conf_toml.parse()?;
-        let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;

        assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
        assert_eq!(
@@ -1612,11 +1661,7 @@ threshold = "20m"
                .evictions_low_residence_duration_metric_threshold,
            Duration::from_secs(20 * 60)
        );
-
-        // Assert that the node id provided by the indentity file (threaded
-        // through the call to [`PageServerConf::parse_and_validate`] is
-        // used.
-        assert_eq!(conf.id, NodeId(333));
+        assert_eq!(conf.id, NodeId(222));
        assert_eq!(
            conf.disk_usage_based_eviction,
            Some(DiskUsageEvictionTaskConfig {
@@ -1625,7 +1670,7 @@ threshold = "20m"
                period: Duration::from_secs(10),
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: Default::default(),
+                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
            })
        );

@@ -1661,7 +1706,7 @@ threshold = "20m"
 "#,
        );
        let toml: Document = pageserver_conf_toml.parse().unwrap();
-        let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();

        match &conf.default_tenant_conf.eviction_policy {
            EvictionPolicy::OnlyImitiate(t) => {
@@ -1680,7 +1725,7 @@ threshold = "20m"
 remote_storage = {}
        "#;
        let doc = toml_edit::Document::from_str(input).unwrap();
-        let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
+        let err = PageServerConf::parse_and_validate(&doc, &workdir)
            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
        assert!(format!("{err}").contains("remote_storage"), "{err}");
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,6 +1,5 @@
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
-use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::size::CalculateSyntheticSizeError;
@@ -40,74 +39,56 @@ type RawMetric = (MetricsKey, (EventType, u64));
 /// for deduplication, but that is no longer needed.
 type Cache = HashMap<MetricsKey, (EventType, u64)>;

-pub async fn run(
-    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
-    cancel: CancellationToken,
-) {
-    let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else {
-        return;
-    };
-
-    let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
-
-    let metrics_ctx = RequestContext::todo_child(
-        TaskKind::MetricsCollection,
-        // This task itself shouldn't download anything.
-        // The actual size calculation does need downloads, and
-        // creates a child context with the right DownloadBehavior.
-        DownloadBehavior::Error,
-    );
-    let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-        "consumption metrics collection",
-        collect_metrics(
-            tenant_manager.clone(),
-            metric_collection_endpoint,
-            &conf.metric_collection_bucket,
-            conf.metric_collection_interval,
-            conf.id,
-            local_disk_storage,
-            cancel.clone(),
-            metrics_ctx,
-        )
-        .instrument(info_span!("metrics_collection")),
-    ));
-
-    let worker_ctx =
-        RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
-    let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-        "synthetic size calculation",
-        calculate_synthetic_size_worker(
-            tenant_manager.clone(),
-            conf.synthetic_size_calculation_interval,
-            cancel.clone(),
-            worker_ctx,
-        )
-        .instrument(info_span!("synthetic_size_worker")),
-    ));
-
-    let (collect_metrics, synthetic_size_worker) =
-        futures::future::join(collect_metrics, synthetic_size_worker).await;
-    collect_metrics
-        .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
-    synthetic_size_worker
-        .expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
-}
-
 /// Main thread that serves metrics collection
 #[allow(clippy::too_many_arguments)]
-async fn collect_metrics(
+pub async fn collect_metrics(
    tenant_manager: Arc<TenantManager>,
    metric_collection_endpoint: &Url,
    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
+    _cached_metric_collection_interval: Duration,
+    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
    local_disk_storage: Utf8PathBuf,
    cancel: CancellationToken,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
+    if _cached_metric_collection_interval != Duration::ZERO {
+        tracing::warn!(
+            "cached_metric_collection_interval is no longer used, please set it to zero."
+        )
+    }
+
+    // spin up background worker that calculates tenant sizes
+    let worker_ctx =
+        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::CalculateSyntheticSize,
+        None,
+        None,
+        "synthetic size calculation",
+        false,
+        {
+            let tenant_manager = tenant_manager.clone();
+            async move {
+                calculate_synthetic_size_worker(
+                    tenant_manager,
+                    synthetic_size_calculation_interval,
+                    &cancel,
+                    &worker_ctx,
+                )
+                .instrument(info_span!("synthetic_size_worker"))
+                .await?;
+                Ok(())
+            }
+        },
+    );
+
    let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);

+    let cancel = task_mgr::shutdown_token();
+
    let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);

    let mut cached_metrics = tokio::select! {
@@ -122,7 +103,7 @@ async fn collect_metrics(
        .expect("Failed to create http client with timeout");

    let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
-        match GenericRemoteStorage::from_config(bucket_config).await {
+        match GenericRemoteStorage::from_config(bucket_config) {
            Ok(client) => Some(client),
            Err(e) => {
                // Non-fatal error: if we were given an invalid config, we will proceed
@@ -194,9 +175,11 @@ async fn collect_metrics(
            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );

-        let res =
-            tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled())
-                .await;
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
        if res.is_ok() {
            return Ok(());
        }
@@ -296,8 +279,8 @@ async fn reschedule(
 async fn calculate_synthetic_size_worker(
    tenant_manager: Arc<TenantManager>,
    synthetic_size_calculation_interval: Duration,
-    cancel: CancellationToken,
-    ctx: RequestContext,
+    cancel: &CancellationToken,
+    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
    scopeguard::defer! {
@@ -337,7 +320,7 @@ async fn calculate_synthetic_size_worker(
            // there is never any reason to exit calculate_synthetic_size_worker following any
            // return value -- we don't need to care about shutdown because no tenant is found when
            // pageserver is shut down.
-            calculate_and_log(&tenant, &cancel, &ctx).await;
+            calculate_and_log(&tenant, cancel, ctx).await;
        }

        crate::tenant::tasks::warn_when_period_overrun(
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -59,7 +59,6 @@
 //! 1. It should be easy to forward the context to callees.
 //! 2. To propagate more data from high-level to low-level code, the functions in
 //!    the middle should not need to be modified.
-//!
 //! The solution is to have a container structure ([`RequestContext`]) that
 //! carries the information. Functions that don't care about what's in it
 //! pass it along to callees.
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -171,14 +171,14 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            register,
        };

+        fail::fail_point!("control-plane-client-re-attach");
+
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
            response.tenants.len()
        );

-        failpoint_support::sleep_millis_async!("control-plane-client-re-attach");
-
        Ok(response
            .tenants
            .into_iter()
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -828,9 +828,9 @@ mod test {
        }
    }

-    async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
+    fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
        let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
-        let harness = TenantHarness::create(test_name).await?;
+        let harness = TenantHarness::create(test_name)?;

        // We do not load() the harness: we only need its config and remote_storage

@@ -844,9 +844,7 @@ mod test {
            },
            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
-        let storage = GenericRemoteStorage::from_config(&storage_config)
-            .await
-            .unwrap();
+        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

        let mock_control_plane = MockControlPlane::new();

@@ -924,9 +922,7 @@ mod test {
    #[tokio::test]
    async fn deletion_queue_smoke() -> anyhow::Result<()> {
        // Basic test that the deletion queue processes the deletions we pass into it
-        let ctx = setup("deletion_queue_smoke")
-            .await
-            .expect("Failed test setup");
+        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

@@ -996,9 +992,7 @@ mod test {

    #[tokio::test]
    async fn deletion_queue_validation() -> anyhow::Result<()> {
-        let ctx = setup("deletion_queue_validation")
-            .await
-            .expect("Failed test setup");
+        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

@@ -1057,9 +1051,7 @@ mod test {
    #[tokio::test]
    async fn deletion_queue_recovery() -> anyhow::Result<()> {
        // Basic test that the deletion queue processes the deletions we pass into it
-        let mut ctx = setup("deletion_queue_recovery")
-            .await
-            .expect("Failed test setup");
+        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -59,14 +59,13 @@ use utils::{completion, id::TimelineId};
 use crate::{
    config::PageServerConf,
    metrics::disk_usage_based_eviction::METRICS,
-    task_mgr::{self, BACKGROUND_RUNTIME},
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
    },
-    CancellableTask, DiskUsageEvictionTask,
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -84,9 +83,17 @@ pub struct DiskUsageEvictionTaskConfig {

 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "type", content = "args")]
 pub enum EvictionOrder {
+    /// Order the layers to be evicted by how recently they have been accessed in absolute
+    /// time.
+    ///
+    /// This strategy is unfair towards slower-growing tenants when some tenants grow faster
+    /// than others.
+    #[default]
+    AbsoluteAccessed,
+
    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
    RelativeAccessed {
@@ -101,14 +108,6 @@ pub enum EvictionOrder {
    },
 }

-impl Default for EvictionOrder {
-    fn default() -> Self {
-        Self::RelativeAccessed {
-            highest_layer_count_loses_first: true,
-        }
-    }
-}
-
 fn default_highest_layer_count_loses_first() -> bool {
    true
 }
@@ -118,6 +117,11 @@ impl EvictionOrder {
        use EvictionOrder::*;

        match self {
+            AbsoluteAccessed => {
+                candidates.sort_unstable_by_key(|(partition, candidate)| {
+                    (*partition, candidate.last_activity_ts)
+                });
+            }
            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
                (*partition, candidate.relative_last_activity)
            }),
@@ -130,6 +134,7 @@ impl EvictionOrder {
        use EvictionOrder::*;

        match self {
+            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
            RelativeAccessed {
                highest_layer_count_loses_first,
            } => {
@@ -187,34 +192,36 @@ pub fn launch_disk_usage_global_eviction_task(
    state: Arc<State>,
    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
-) -> Option<DiskUsageEvictionTask> {
+) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
-        return None;
+        return Ok(());
    };

    info!("launching disk usage based eviction task");

-    let cancel = CancellationToken::new();
-    let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
        "disk usage based eviction",
-        {
-            let cancel = cancel.clone();
-            async move {
-                // wait until initial load is complete, because we cannot evict from loading tenants.
-                tokio::select! {
-                    _ = cancel.cancelled() => { return anyhow::Ok(()); },
-                    _ = background_jobs_barrier.wait() => { }
-                };
+        false,
+        async move {
+            let cancel = task_mgr::shutdown_token();

-                disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel)
-                    .await;
-                anyhow::Ok(())
-            }
+            // wait until initial load is complete, because we cannot evict from loading tenants.
+            tokio::select! {
+                _ = cancel.cancelled() => { return Ok(()); },
+                _ = background_jobs_barrier.wait() => { }
+            };
+
+            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
+            Ok(())
        },
-    ));
+    );

-    Some(DiskUsageEvictionTask(CancellableTask { cancel, task }))
+    Ok(())
 }

 #[instrument(skip_all)]
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -377,7 +377,7 @@ paths:
              schema:
                $ref: "#/components/schemas/ConflictError"

-  /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
+  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
    parameters:
      - name: tenant_id
        in: path
@@ -397,51 +397,6 @@ paths:
        "202":
          description: Tenant scheduled to load successfully

-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-    put:
-      description: |
-        Either archives or unarchives the given timeline.
-        An archived timeline may not have any non-archived children.
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/ArchivalConfigRequest"
-      responses:
-        "200":
-          description: Timeline (un)archived successfully
-        "409":
-          description: |
-            The tenant/timeline is already being modified, perhaps by a concurrent call to this API
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
      - name: tenant_id
@@ -474,9 +429,7 @@ paths:
              schema:
                $ref: "#/components/schemas/SyntheticSizeResponse"
            text/html:
-              schema:
-                type: string
-                description: SVG representation of the tenant and its timelines.
+              description: SVG representation of the tenant and its timelines.
        "401":
          description: Unauthorized Error
          content:
@@ -615,7 +568,7 @@ paths:
          type: string
      - name: timeline_id
        in: path
-        required: true
+        required: true
        schema:
          type: string

@@ -821,13 +774,15 @@ components:
    TenantCreateRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
-        - $ref: '#/components/schemas/TenantLoadRequest'
        - type: object
          required:
            - new_tenant_id
          properties:
            new_tenant_id:
              type: string
+            generation:
+              type: integer
+              description: Attachment generation number.
    TenantLoadRequest:
      type: object
      properties:
@@ -891,15 +846,6 @@ components:
        warm:
          type: boolean
          description: Whether to poll remote storage for layers to download.  If false, secondary locations don't download anything.
-    ArchivalConfigRequest:
-      type: object
-      required
-        - state
-      properties:
-        state:
-          description: The archival state of a timeline
-          type: string
-          enum: ["Archived", "Unarchived"]
    TenantConfig:
      type: object
      properties:
@@ -927,6 +873,8 @@ components:
          type: string
        max_lsn_wal_lag:
          type: integer
+        trace_read_requests:
+          type: boolean
        heatmap_period:
          type: string
    TenantConfigResponse:
@@ -1160,7 +1108,7 @@ components:
        reparented_timelines:
          type: array
          description: Set of reparented timeline ids
-          items:
+          properties:
            type: string
            format: hex
            description: TimelineId
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -10,7 +10,6 @@ use std::time::Duration;

 use anyhow::{anyhow, Context, Result};
 use enumset::EnumSet;
-use futures::StreamExt;
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
 use hyper::header;
@@ -18,17 +17,14 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::AuxFilePolicy;
-use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
-use pageserver_api::models::LocationConfigMode;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
-use pageserver_api::models::TenantLocationConfigRequest;
 use pageserver_api::models::TenantLocationConfigResponse;
 use pageserver_api::models::TenantScanRemoteStorageResponse;
 use pageserver_api::models::TenantScanRemoteStorageShard;
@@ -36,24 +32,24 @@ use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
-use pageserver_api::models::TimelineArchivalConfigRequest;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
+use pageserver_api::models::{
+    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
+};
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
 use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
-use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
-use utils::http::request::must_parse_query_param;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

 use crate::context::{DownloadBehavior, RequestContext};
@@ -665,39 +661,6 @@ async fn timeline_preserve_initdb_handler(
    json_response(StatusCode::OK, ())
 }

-async fn timeline_archival_config_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-
-    let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);
-
-    async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
-
-        tenant
-            .apply_timeline_archival_config(timeline_id, request_data.state)
-            .await
-            .context("applying archival config")
-            .map_err(ApiError::InternalServerError)?;
-        Ok::<_, ApiError>(())
-    }
-    .instrument(info_span!("timeline_archival_config",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug(),
-                state = ?request_data.state,
-                %timeline_id))
-    .await?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1650,9 +1613,7 @@ async fn timeline_compact_handler(
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await
-            // XXX map to correct ApiError for the cases where it's due to shutdown
-            .context("wait completion").map_err(ApiError::InternalServerError)?;
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
        }
        json_response(StatusCode::OK, ())
    }
@@ -1678,10 +1639,6 @@ async fn timeline_checkpoint_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
-
-    // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
-    let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
-
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

@@ -1698,22 +1655,18 @@ async fn timeline_checkpoint_handler(

                }
            })?;
-        if compact {
-            timeline
-                .compact(&cancel, flags, &ctx)
-                .await
-                .map_err(|e|
-                    match e {
-                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::Other(e) => ApiError::InternalServerError(e)
-                    }
-                )?;
-        }
+        timeline
+            .compact(&cancel, flags, &ctx)
+            .await
+            .map_err(|e|
+                match e {
+                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
+                    CompactionError::Other(e) => ApiError::InternalServerError(e)
+                }
+            )?;

        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await
-            // XXX map to correct ApiError for the cases where it's due to shutdown
-            .context("wait completion").map_err(ApiError::InternalServerError)?;
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
        }

        json_response(StatusCode::OK, ())
@@ -1765,9 +1718,7 @@ async fn timeline_detach_ancestor_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::timeline::detach_ancestor;
-    use pageserver_api::models::detach_ancestor::AncestorDetached;
-
+    use crate::tenant::timeline::detach_ancestor::Options;
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1775,7 +1726,7 @@ async fn timeline_detach_ancestor_handler(
    let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);

    async move {
-        let mut options = detach_ancestor::Options::default();
+        let mut options = Options::default();

        let rewrite_concurrency =
            parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1803,36 +1754,27 @@ async fn timeline_detach_ancestor_handler(

        let timeline = tenant.get_timeline(timeline_id, true)?;

-        let progress = timeline
+        let (_guard, prepared) = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
            .await?;

-        // uncomment to allow early as possible Tenant::drop
-        // drop(tenant);
+        let res = state
+            .tenant_manager
+            .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
+            .await;

-        let resp = match progress {
-            detach_ancestor::Progress::Prepared(_guard, prepared) => {
-                // it would be great to tag the guard on to the tenant activation future
-                let reparented_timelines = state
-                    .tenant_manager
-                    .complete_detaching_timeline_ancestor(
-                        tenant_shard_id,
-                        timeline_id,
-                        prepared,
-                        ctx,
-                    )
-                    .await
-                    .context("timeline detach ancestor completion")
-                    .map_err(ApiError::InternalServerError)?;
-
-                AncestorDetached {
+        match res {
+            Ok(reparented_timelines) => {
+                let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
                    reparented_timelines,
-                }
-            }
-            detach_ancestor::Progress::Done(resp) => resp,
-        };
+                };

-        json_response(StatusCode::OK, resp)
+                json_response(StatusCode::OK, resp)
+            }
+            Err(e) => Err(ApiError::InternalServerError(
+                e.context("timeline detach completion"),
+            )),
+        }
    }
    .instrument(span)
    .await
@@ -2462,189 +2404,6 @@ async fn post_top_tenants(
    )
 }

-async fn put_tenant_timeline_import_basebackup(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
-    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
-
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
-    async move {
-        let state = get_state(&request);
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
-
-        let broker_client = state.broker_client.clone();
-
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
-
-        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-        let timeline = tenant
-            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
-            .map_err(ApiError::InternalServerError)
-            .await?;
-
-        // TODO mark timeline as not ready until it reaches end_lsn.
-        // We might have some wal to import as well, and we should prevent compute
-        // from connecting before that and writing conflicting wal.
-        //
-        // This is not relevant for pageserver->pageserver migrations, since there's
-        // no wal to import. But should be fixed if we want to import from postgres.
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import basebackup provided via CopyData
-        info!("importing basebackup");
-
-        timeline
-            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        // Read the end of the tar archive.
-        read_tar_eof(body)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        // TODO check checksum
-        // Meanwhile you can verify client-side by taking fullbackup
-        // and checking that it matches in size with what was imported.
-        // It wouldn't work if base came from vanilla postgres though,
-        // since we discard some log files.
-
-        info!("done");
-        json_response(StatusCode::OK, ())
-    }
-    .instrument(span)
-    .await
-}
-
-async fn put_tenant_timeline_import_wal(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
-    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
-    async move {
-        let state = get_state(&request);
-
-        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
-
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
-
-        let last_record_lsn = timeline.get_last_record_lsn();
-        if last_record_lsn != start_lsn {
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
-        }
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import wal provided via CopyData
-        info!("importing wal");
-        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
-        info!("wal import complete");
-
-        // Read the end of the tar archive.
-        read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
-
-        // TODO Does it make sense to overshoot?
-        if timeline.get_last_record_lsn() < end_lsn {
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
-        }
-
-        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
-        // We only want to persist the data, and it doesn't matter if it's in the
-        // shape of deltas or images.
-        info!("flushing layers");
-        timeline.freeze_and_flush().await.map_err(|e| match e {
-            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
-            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-        })?;
-
-        info!("done");
-
-        json_response(StatusCode::OK, ())
-    }.instrument(span).await
-}
-
-/// Read the end of a tar archive.
-///
-/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
-/// `tokio_tar` already read the first such block. Read the second all-zeros block,
-/// and check that there is no more data after the EOF marker.
-///
-/// 'tar' command can also write extra blocks of zeros, up to a record
-/// size, controlled by the --record-size argument. Ignore them too.
-async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
-    use tokio::io::AsyncReadExt;
-    let mut buf = [0u8; 512];
-
-    // Read the all-zeros block, and verify it
-    let mut total_bytes = 0;
-    while total_bytes < 512 {
-        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
-        total_bytes += nbytes;
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if total_bytes < 512 {
-        anyhow::bail!("incomplete or invalid tar EOF marker");
-    }
-    if !buf.iter().all(|&x| x == 0) {
-        anyhow::bail!("invalid tar EOF marker");
-    }
-
-    // Drain any extra zero-blocks after the EOF marker
-    let mut trailing_bytes = 0;
-    let mut seen_nonzero_bytes = false;
-    loop {
-        let nbytes = reader.read(&mut buf).await?;
-        trailing_bytes += nbytes;
-        if !buf.iter().all(|&x| x == 0) {
-            seen_nonzero_bytes = true;
-        }
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if seen_nonzero_bytes {
-        anyhow::bail!("unexpected non-zero bytes after the tar archive");
-    }
-    if trailing_bytes % 512 != 0 {
-        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
-    }
-    Ok(())
-}
-
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2833,10 +2592,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
            |r| api_handler(r, timeline_preserve_initdb_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
-            |r| api_handler(r, timeline_archival_config_handler),
-        )
        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
@@ -2943,13 +2698,5 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
            |r| testing_api_handler("perf_info", r, perf_info),
        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
-            |r| api_handler(r, put_tenant_timeline_import_basebackup),
-        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
-            |r| api_handler(r, put_tenant_timeline_import_wal),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -13,7 +13,6 @@ pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
 pub use pageserver_api::keyspace;
-use tokio_util::sync::CancellationToken;
 pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
@@ -24,6 +23,7 @@ pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
+pub mod trace;
 pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
@@ -33,10 +33,7 @@ pub mod walredo;
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
-use tenant::{
-    mgr::{BackgroundPurges, TenantManager},
-    secondary,
-};
+use tenant::mgr::TenantManager;
 use tracing::info;

 /// Current storage format version
@@ -58,39 +55,17 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

-pub struct CancellableTask {
-    pub task: tokio::task::JoinHandle<()>,
-    pub cancel: CancellationToken,
-}
-pub struct HttpEndpointListener(pub CancellableTask);
-pub struct LibpqEndpointListener(pub CancellableTask);
-pub struct ConsumptionMetricsTasks(pub CancellableTask);
-pub struct DiskUsageEvictionTask(pub CancellableTask);
-impl CancellableTask {
-    pub async fn shutdown(self) {
-        self.cancel.cancel();
-        self.task.await.unwrap();
-    }
-}
-
 #[tracing::instrument(skip_all, fields(%exit_code))]
-#[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
-    http_listener: HttpEndpointListener,
-    libpq_listener: LibpqEndpointListener,
-    consumption_metrics_worker: ConsumptionMetricsTasks,
-    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
-    background_purges: BackgroundPurges,
    mut deletion_queue: DeletionQueue,
-    secondary_controller_tasks: secondary::GlobalTasks,
    exit_code: i32,
 ) {
    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
    timed(
-        libpq_listener.0.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -117,44 +92,16 @@ pub async fn shutdown_pageserver(
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    deletion_queue.shutdown(Duration::from_secs(5)).await;

-    timed(
-        consumption_metrics_worker.0.shutdown(),
-        "shutdown consumption metrics",
-        Duration::from_secs(1),
-    )
-    .await;
-
-    timed(
-        futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())),
-        "shutdown disk usage eviction",
-        Duration::from_secs(1),
-    )
-    .await;
-
-    timed(
-        background_purges.shutdown(),
-        "shutdown background purges",
-        Duration::from_secs(1),
-    )
-    .await;
-
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
    timed(
-        http_listener.0.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
        "shutdown http",
        Duration::from_secs(1),
    )
    .await;

-    timed(
-        secondary_controller_tasks.wait(), // cancellation happened in caller
-        "secondary controller wait",
-        Duration::from_secs(1),
-    )
-    .await;
-
    // There should be nothing left, but let's be sure
    timed(
        task_mgr::shutdown_tasks(None, None, None),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -473,31 +473,6 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
-#[strum(serialize_all = "kebab_case")]
-pub(crate) enum MetricLayerKind {
-    Delta,
-    Image,
-}
-
-static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_layer_bytes",
-        "Sum of layer physical sizes in bytes",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_layer_count",
-        "Number of layers that exist",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
-    )
-    .expect("failed to define a metric")
-});
-
 static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_archive_size",
@@ -594,38 +569,6 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_circuit_breaker_broken",
-        "How many times a circuit breaker has broken"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_circuit_breaker_unbroken",
-        "How many times a circuit breaker has been un-broken (recovered)"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_total",
-        "Size of uncompressed data written into image layers"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_out_bytes_total",
-        "Size of compressed image layer written"
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -1513,12 +1456,10 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
    }
 }

-pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "pageserver_live_connections_started",
-        "Number of network connections that we started handling",
-        "pageserver_live_connections_finished",
-        "Number of network connections that we finished handling",
+pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_live_connections",
+        "Number of live network connections",
        &["pageserver_connection_kind"]
    )
    .expect("failed to define a metric")
@@ -1530,7 +1471,10 @@ pub(crate) enum ComputeCommandKind {
    PageStream,
    Basebackup,
    Fullbackup,
+    ImportBasebackup,
+    ImportWal,
    LeaseLsn,
+    Show,
 }

 pub(crate) struct ComputeCommandCounters {
@@ -2182,10 +2126,6 @@ pub(crate) struct TimelineMetrics {
    pub last_record_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
-    pub(crate) layer_size_image: UIntGauge,
-    pub(crate) layer_count_image: UIntGauge,
-    pub(crate) layer_size_delta: UIntGauge,
-    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
@@ -2268,42 +2208,6 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

-        let layer_size_image = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_count_image = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_size_delta = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
-            .unwrap();
-
-        let layer_count_delta = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
-            .unwrap();
-
        let standby_horizon_gauge = STANDBY_HORIZON
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2358,10 +2262,6 @@ impl TimelineMetrics {
            last_record_gauge,
            pitr_history_size,
            archival_size,
-            layer_size_image,
-            layer_count_image,
-            layer_size_delta,
-            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
@@ -2423,31 +2323,6 @@ impl TimelineMetrics {
        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
-
        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -3104,8 +2979,6 @@ pub fn preinitialize_metrics() {
        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
        &REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
        &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
-        &CIRCUIT_BREAKERS_BROKEN,
-        &CIRCUIT_BREAKERS_UNBROKEN,
    ]
    .into_iter()
    .for_each(|c| {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -4,7 +4,9 @@
 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
+use bytes::Bytes;
 use futures::stream::FuturesUnordered;
+use futures::Stream;
 use futures::StreamExt;
 use pageserver_api::key::Key;
 use pageserver_api::models::TenantState;
@@ -26,6 +28,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io;
 use std::net::TcpListener;
+use std::pin::pin;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -34,8 +37,10 @@ use std::time::Instant;
 use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
 use utils::{
    auth::{Claims, Scope, SwappableJwtAuth},
@@ -48,8 +53,9 @@ use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
-use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
+use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
@@ -60,11 +66,13 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
+use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Tenant;
 use crate::tenant::Timeline;
+use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -74,6 +82,56 @@ use postgres_ffi::BLCKSZ;
 // is not yet in state [`TenantState::Active`].
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

+/// Read the end of a tar archive, consuming `reader` until EOF.
+///
+/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
+/// `tokio_tar` already read the first such block. Read the second all-zeros block; fail
+/// if it is short or not all zeros. Then drain the rest of the input: the `tar` command
+/// may pad the archive with extra zero blocks, up to a record size (controlled by the
+/// --record-size argument). Fail if the padding contains a non-zero byte or its length
+/// is not a multiple of 512 bytes.
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    let mut buf = [0u8; 512];
+
+    // Read the all-zeros block (a single read may return fewer than 512 bytes), and verify it
+    let mut total_bytes = 0;
+    while total_bytes < 512 {
+        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
+        total_bytes += nbytes;
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if total_bytes < 512 {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if !buf.iter().all(|&x| x == 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Drain any extra zero-blocks after the EOF marker, reading until the reader reports EOF
+    let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
+    loop {
+        let nbytes = reader.read(&mut buf).await?;
+        trailing_bytes += nbytes;
+        if !buf.iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    }
+    Ok(())
+}
+
 ///////////////////////////////////////////////////////////////////////////////

 ///
@@ -83,6 +141,7 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 ///
 pub async fn libpq_listener_main(
    tenant_manager: Arc<TenantManager>,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
@@ -124,8 +183,10 @@ pub async fn libpq_listener_main(
                    None,
                    None,
                    "serving compute connection task",
+                    false,
                    page_service_conn_main(
                        tenant_manager.clone(),
+                        broker_client.clone(),
                        local_auth,
                        socket,
                        auth_type,
@@ -148,14 +209,20 @@ pub async fn libpq_listener_main(
 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
    tenant_manager: Arc<TenantManager>,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    let _guard = LIVE_CONNECTIONS
-        .with_label_values(&["page_service"])
-        .guard();
+    // Immediately increment the gauge, then create a job to decrement it on task exit.
+    // One of the pros of `defer!` is that this will *most probably*
+    // get called, even in the presence of panics.
+    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
+    gauge.inc();
+    scopeguard::defer! {
+        gauge.dec();
+    }

    socket
        .set_nodelay(true)
@@ -200,11 +267,12 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
+    let mut conn_handler =
+        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
-        .run(&mut conn_handler, &task_mgr::shutdown_token())
+        .run(&mut conn_handler, task_mgr::shutdown_watcher)
        .await
    {
        Ok(()) => {
@@ -231,6 +299,7 @@ struct HandlerTimeline {
 }

 struct PageServerHandler {
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -322,11 +391,13 @@ impl From<WaitLsnError> for QueryError {
 impl PageServerHandler {
    pub fn new(
        tenant_manager: Arc<TenantManager>,
+        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
            tenant_manager,
+            broker_client,
            auth,
            claims: None,
            connection_ctx,
@@ -409,6 +480,73 @@ impl PageServerHandler {
        )
    }

+    fn copyin_stream<'a, IO>(
+        &'a self,
+        pgb: &'a mut PostgresBackend<IO>,
+        cancel: &'a CancellationToken,
+    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        async_stream::try_stream! {
+            loop {
+                let msg = tokio::select! {
+                    biased;
+
+                    _ = cancel.cancelled() => {
+                        // Shutdown requested: buffer an ErrorResponse (best effort, unflushed).
+                        let msg = "pageserver is shutting down";
+                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                        Err(QueryError::Shutdown)
+                    }
+
+                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+                };
+
+                match msg {
+                    Ok(Some(message)) => {
+                        let copy_data_bytes = match message {
+                            FeMessage::CopyData(bytes) => bytes,
+                            FeMessage::CopyDone => { break },
+                            FeMessage::Sync => continue,
+                            FeMessage::Terminate => {
+                                let msg = "client terminated connection with Terminate message during COPY";
+                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                                // not expected to fail: ErrorResponse serialization should always succeed
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                                break;
+                            }
+                            m => {
+                                let msg = format!("unexpected message {m:?}");
+                                // not expected to fail: ErrorResponse serialization should always succeed
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                                break;
+                            }
+                        };
+
+                        yield copy_data_bytes;
+                    }
+                    Ok(None) => {
+                        let msg = "client closed connection during COPY";
+                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                        // not expected to fail: ErrorResponse serialization should always succeed
+                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                    }
+                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                        Err(io_error)?;
+                    }
+                    Err(other) => {
+                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                    }
+                };
+            }
+        }
+    }
+
    #[instrument(skip_all)]
    async fn handle_pagerequests<IO>(
        &mut self,
@@ -427,6 +565,18 @@ impl PageServerHandler {
            .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
            .await?;

+        // Make request tracer if needed
+        let mut tracer = if tenant.get_trace_read_requests() {
+            let connection_id = ConnectionId::generate();
+            let path =
+                tenant
+                    .conf
+                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
+            Some(Tracer::new(path))
+        } else {
+            None
+        };
+
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -458,6 +608,11 @@ impl PageServerHandler {
            trace!("query: {copy_data_bytes:?}");
            fail::fail_point!("ps::handle-pagerequest-message");

+            // Trace request if needed
+            if let Some(t) = tracer.as_mut() {
+                t.trace(&copy_data_bytes)
+            }
+
            let neon_fe_msg =
                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

@@ -563,6 +718,128 @@ impl PageServerHandler {
        Ok(())
    }

+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
+    async fn handle_import_basebackup<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        base_lsn: Lsn,
+        _end_lsn: Lsn,
+        pg_version: u32,
+        ctx: RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+
+        // Resolve the tenant (shard zero), then create an empty timeline for the import
+        info!("creating new timeline");
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
+            .await?;
+        let timeline = tenant
+            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
+            .await?;
+
+        // TODO mark timeline as not ready until it reaches end_lsn.
+        // We might have some wal to import as well, and we should prevent compute
+        // from connecting before that and writing conflicting wal.
+        //
+        // This is not relevant for pageserver->pageserver migrations, since there's
+        // no wal to import. But should be fixed if we want to import from postgres.
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Send CopyInResponse, then import the basebackup tar the client streams via CopyData
+        info!("importing basebackup");
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
+        self.flush_cancellable(pgb, &tenant.cancel).await?;
+
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
+        timeline
+            .import_basebackup_from_tar(
+                tenant.clone(),
+                &mut copyin_reader,
+                base_lsn,
+                self.broker_client.clone(),
+                &ctx,
+            )
+            .await?;
+
+        // Read the end of the tar archive; fails on a bad EOF marker or non-zero trailing data.
+        read_tar_eof(copyin_reader).await?;
+
+        // TODO check checksum
+        // Meanwhile you can verify client-side by taking fullbackup
+        // and checking that it matches in size with what was imported.
+        // It wouldn't work if base came from vanilla postgres though,
+        // since we discard some log files.
+
+        info!("done");
+        Ok(())
+    }
+
+    /// Import WAL from a tar archive that the client streams via CopyData into a timeline.
+    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
+    async fn handle_import_wal<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        start_lsn: Lsn,
+        end_lsn: Lsn,
+        ctx: RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .await?;
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
+
+        // TODO leave clean state on error. For now, use detach to clean up after a failed import.
+
+        // Import wal provided via CopyData
+        info!("importing wal");
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
+        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
+        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
+        info!("wal import complete");
+
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;
+
+        // TODO Does it make sense to overshoot?
+        let reached_lsn = timeline.get_last_record_lsn();
+        if reached_lsn < end_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("WAL import stopped at Lsn {reached_lsn}, before reaching the requested end Lsn {end_lsn}"))
+            );
+        }
+
+        // Flush data to disk, then upload to s3. No need for a forced checkpoint: we only want
+        // to persist the data, and it doesn't matter if it's in the shape of deltas or images.
+        info!("flushing layers");
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            FlushLayerError::Cancelled => QueryError::Shutdown,
+            other => QueryError::Other(other.into()),
+        })?;
+
+        info!("done");
+        Ok(())
+    }
+
    /// Helper function to handle the LSN from client request.
    ///
    /// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1433,6 +1710,109 @@ where
            )
            .await?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("import basebackup ") {
+            // Import the `base` section (everything but the wal) of a basebackup.
+            // Assumes the tenant already exists on this pageserver.
+            //
+            // Files are scheduled to be persisted to remote storage, and the
+            // caller should poll the http api to check when that is done.
+            //
+            // Example import command:
+            // 1. Get start/end LSN from backup_manifest file
+            // 2. Run:
+            // cat my_backup/base.tar | psql -h $PAGESERVER \
+            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
+            let params = &parts[2..];
+            if params.len() != 5 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import basebackup command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let base_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+            let pg_version = u32::from_str(params[4])
+                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::ImportBasebackup)
+                .inc();
+
+            match self
+                .handle_import_basebackup(
+                    pgb,
+                    tenant_id,
+                    timeline_id,
+                    base_lsn,
+                    end_lsn,
+                    pg_version,
+                    ctx,
+                )
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
+        } else if query_string.starts_with("import wal ") {
+            // Import the `pg_wal` section of a basebackup.
+            //
+            // Files are scheduled to be persisted to remote storage, and the
+            // caller should poll the http api to check when that is done.
+            let params = &parts[2..];
+            if params.len() != 4 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import wal command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let start_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::ImportWal)
+                .inc();
+
+            match self
+                .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
@@ -1478,6 +1858,66 @@ where
                    ))?
                }
            };
+        } else if let Some(params) = parts.strip_prefix(&["show"]) {
+            // show <tenant_id>
+            if params.len() != 1 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for config command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+
+            tracing::Span::current().record("tenant_id", field::display(tenant_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::Show)
+                .inc();
+
+            let tenant = self
+                .get_active_tenant_with_timeout(
+                    tenant_id,
+                    ShardSelector::Zero,
+                    ACTIVE_TENANT_TIMEOUT,
+                )
+                .await?;
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
+                RowDescriptor::int8_col(b"checkpoint_distance"),
+                RowDescriptor::int8_col(b"checkpoint_timeout"),
+                RowDescriptor::int8_col(b"compaction_target_size"),
+                RowDescriptor::int8_col(b"compaction_period"),
+                RowDescriptor::int8_col(b"compaction_threshold"),
+                RowDescriptor::int8_col(b"gc_horizon"),
+                RowDescriptor::int8_col(b"gc_period"),
+                RowDescriptor::int8_col(b"image_creation_threshold"),
+                RowDescriptor::int8_col(b"pitr_interval"),
+            ]))?
+            .write_message_noflush(&BeMessage::DataRow(&[
+                Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
+                Some(
+                    tenant
+                        .get_checkpoint_timeout()
+                        .as_secs()
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(tenant.get_compaction_target_size().to_string().as_bytes()),
+                Some(
+                    tenant
+                        .get_compaction_period()
+                        .as_secs()
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(tenant.get_compaction_threshold().to_string().as_bytes()),
+                Some(tenant.get_gc_horizon().to_string().as_bytes()),
+                Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
+                Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
+                Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
+            ]))?
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else {
            return Err(QueryError::Other(anyhow::anyhow!(
                "unknown command {query_string}"
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -284,16 +284,6 @@ impl Timeline {
        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
            return Ok(true);
        }
-        // then check if the database was already initialized.
-        // get_rel_exists can be called before dbdir is created.
-        let buf = version.get(self, DBDIR_KEY, ctx).await?;
-        let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
-            Ok(dir) => Ok(dir.dbdirs),
-            Err(e) => Err(PageReconstructError::from(e)),
-        }?;
-        if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
-            return Ok(false);
-        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
        let buf = version.get(self, key, ctx).await?;
@@ -532,7 +522,7 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Option<TimestampTz>, PageReconstructError> {
        let mut max: Option<TimestampTz> = None;
-        self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
+        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
            if let Some(max_prev) = max {
                max = Some(max_prev.max(timestamp));
            } else {
@@ -864,14 +854,13 @@ impl Timeline {
        result.add_key(DBDIR_KEY);

        // Fetch list of database dirs and iterate them
-        let dbdir = self.list_dbdirs(lsn, ctx).await?;
-        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
+        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
+        let dbdir = DbDirectory::des(&buf)?;

-        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
-        for ((spcnode, dbnode), has_relmap_file) in dbs {
-            if has_relmap_file {
-                result.add_key(relmap_file_key(spcnode, dbnode));
-            }
+        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
+        dbs.sort_unstable();
+        for (spcnode, dbnode) in dbs {
+            result.add_key(relmap_file_key(spcnode, dbnode));
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
@@ -930,9 +919,6 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

-        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
-        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
-        // and the keys will not be garbage-colllected.
        #[cfg(test)]
        {
            let guard = self.extra_test_dense_keyspace.load();
@@ -941,48 +927,13 @@ impl Timeline {
            }
        }

-        let dense_keyspace = result.to_keyspace();
-        let sparse_keyspace = SparseKeySpace(KeySpace {
-            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
-        });
-
-        if cfg!(debug_assertions) {
-            // Verify if the sparse keyspaces are ordered and non-overlapping.
-
-            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
-            // category of sparse keys are split into their own image/delta files. If there
-            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
-            // and we want the developer to keep the keyspaces separated.
-
-            let ranges = &sparse_keyspace.0.ranges;
-
-            // TODO: use a single overlaps_with across the codebase
-            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-                !(a.end <= b.start || b.end <= a.start)
-            }
-            for i in 0..ranges.len() {
-                for j in 0..i {
-                    if overlaps_with(&ranges[i], &ranges[j]) {
-                        panic!(
-                            "overlapping sparse keyspace: {}..{} and {}..{}",
-                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
-                        );
-                    }
-                }
-            }
-            for i in 1..ranges.len() {
-                assert!(
-                    ranges[i - 1].end <= ranges[i].start,
-                    "unordered sparse keyspace: {}..{} and {}..{}",
-                    ranges[i - 1].start,
-                    ranges[i - 1].end,
-                    ranges[i].start,
-                    ranges[i].end
-                );
-            }
-        }
-
-        Ok((dense_keyspace, sparse_keyspace))
+        Ok((
+            result.to_keyspace(),
+            /* AUX sparse key space */
+            SparseKeySpace(KeySpace {
+                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
+            }),
+        ))
    }

    /// Get cached size of relation if it not updated after specified LSN
@@ -2041,7 +1992,7 @@ mod tests {
    #[tokio::test]
    async fn aux_files_round_trip() -> anyhow::Result<()> {
        let name = "aux_files_round_trip";
-        let harness = TenantHarness::create(name).await?;
+        let harness = TenantHarness::create(name)?;

        pub const TIMELINE_ID: TimelineId =
            TimelineId::from_array(hex!("11223344556677881122334455667788"));
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -408,6 +408,7 @@ pub fn spawn<F>(
    tenant_shard_id: Option<TenantShardId>,
    timeline_id: Option<TimelineId>,
    name: &str,
+    shutdown_process_on_error: bool,
    future: F,
 ) -> PageserverTaskId
 where
@@ -436,6 +437,7 @@ where
        task_id,
        task_cloned,
        cancel,
+        shutdown_process_on_error,
        future,
    ));
    task_mut.join_handle = Some(join_handle);
@@ -452,78 +454,82 @@ async fn task_wrapper<F>(
    task_id: u64,
    task: Arc<PageServerTask>,
    shutdown_token: CancellationToken,
+    shutdown_process_on_error: bool,
    future: F,
 ) where
    F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
    debug!("Starting task '{}'", task_name);

-    // wrap the future so we log panics and errors
-    let tenant_shard_id = task.tenant_shard_id;
-    let timeline_id = task.timeline_id;
-    let fut = async move {
-        // We use AssertUnwindSafe here so that the payload function
-        // doesn't need to be UnwindSafe. We don't do anything after the
-        // unwinding that would expose us to unwind-unsafe behavior.
-        let result = AssertUnwindSafe(future).catch_unwind().await;
+    let result = SHUTDOWN_TOKEN
+        .scope(
+            shutdown_token,
+            CURRENT_TASK.scope(task, {
+                // We use AssertUnwindSafe here so that the payload function
+                // doesn't need to be UnwindSafe. We don't do anything after the
+                // unwinding that would expose us to unwind-unsafe behavior.
+                AssertUnwindSafe(future).catch_unwind()
+            }),
+        )
+        .await;
+    task_finish(result, task_name, task_id, shutdown_process_on_error).await;
+}
+
+async fn task_finish(
+    result: std::result::Result<
+        anyhow::Result<()>,
+        std::boxed::Box<dyn std::any::Any + std::marker::Send>,
+    >,
+    task_name: String,
+    task_id: u64,
+    shutdown_process_on_error: bool,
+) {
+    // Remove our entry from the global hashmap.
+    let task = TASKS
+        .lock()
+        .unwrap()
+        .remove(&task_id)
+        .expect("no task in registry");
+
+    let mut shutdown_process = false;
+    {
        match result {
            Ok(Ok(())) => {
                debug!("Task '{}' exited normally", task_name);
            }
            Ok(Err(err)) => {
-                error!(
-                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    task_name, tenant_shard_id, timeline_id, err
-                );
+                if shutdown_process_on_error {
+                    error!(
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                    shutdown_process = true;
+                } else {
+                    error!(
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                }
            }
            Err(err) => {
-                error!(
-                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    task_name, tenant_shard_id, timeline_id, err
-                );
+                if shutdown_process_on_error {
+                    error!(
+                        "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                    shutdown_process = true;
+                } else {
+                    error!(
+                        "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                        task_name, task.tenant_shard_id, task.timeline_id, err
+                    );
+                }
            }
        }
-    };
+    }

-    // add the task-locals
-    let fut = CURRENT_TASK.scope(task, fut);
-    let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut);
-
-    // poll future to completion
-    fut.await;
-
-    // Remove our entry from the global hashmap.
-    TASKS
-        .lock()
-        .unwrap()
-        .remove(&task_id)
-        .expect("no task in registry");
-}
-
-pub async fn exit_on_panic_or_error<T, E>(
-    task_name: &'static str,
-    future: impl Future<Output = Result<T, E>>,
-) -> T
-where
-    E: std::fmt::Debug,
-{
-    // We use AssertUnwindSafe here so that the payload function
-    // doesn't need to be UnwindSafe. We don't do anything after the
-    // unwinding that would expose us to unwind-unsafe behavior.
-    let result = AssertUnwindSafe(future).catch_unwind().await;
-    match result {
-        Ok(Ok(val)) => val,
-        Ok(Err(err)) => {
-            error!(
-                task_name,
-                "Task exited with error, exiting process: {err:?}"
-            );
-            std::process::exit(1);
-        }
-        Err(panic_obj) => {
-            error!(task_name, "Task panicked, exiting process: {panic_obj:?}");
-            std::process::exit(1);
-        }
+    if shutdown_process {
+        std::process::exit(1);
    }
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> {
 }

 /// Reserved bits for length and compression
-pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
+const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;

 /// The maximum size of blobs we support. The highest few bits
 /// are reserved for compression and other further uses.
 const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;

-pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
-pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
+const BYTE_UNCOMPRESSED: u8 = 0x80;
+const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

 /// A wrapper of `VirtualFile` that allows users to write blobs.
 ///
@@ -273,8 +273,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
    ) -> (B::Buf, Result<u64, Error>) {
-        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await
+        self.write_blob_maybe_compressed(
+            srcbuf,
+            ctx,
+            ImageCompressionAlgorithm::DisabledNoDecompress,
+        )
+        .await
    }

    /// Write a blob of data. Returns the offset that it was written to,
@@ -336,7 +340,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                            (BYTE_UNCOMPRESSED, len, slice.into_inner())
                        }
                    }
-                    ImageCompressionAlgorithm::Disabled => {
+                    ImageCompressionAlgorithm::Disabled
+                    | ImageCompressionAlgorithm::DisabledNoDecompress => {
                        (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
                    }
                };
@@ -390,63 +395,51 @@ impl BlobWriter<false> {
 }

 #[cfg(test)]
-pub(crate) mod tests {
+mod tests {
    use super::*;
    use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
-    use camino::Utf8PathBuf;
-    use camino_tempfile::Utf8TempDir;
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
        round_trip_test_compressed::<BUFFERED>(blobs, false).await
    }

-    pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
+    async fn round_trip_test_compressed<const BUFFERED: bool>(
        blobs: &[Vec<u8>],
        compression: bool,
-        ctx: &RequestContext,
-    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
+    ) -> Result<(), Error> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
-            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
+            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = if compression {
                    wtr.write_blob_maybe_compressed(
                        blob.clone(),
-                        ctx,
+                        &ctx,
                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
                    )
                    .await
                } else {
-                    wtr.write_blob(blob.clone(), ctx).await
+                    wtr.write_blob(blob.clone(), &ctx).await
                };
                let offs = res?;
                offsets.push(offs);
            }
            // Write out one page worth of zeros so that we can
            // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
            let offs = res?;
            println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(ctx).await?;
+            wtr.flush_buffer(&ctx).await?;
        }
-        Ok((temp_dir, pathbuf, offsets))
-    }

-    async fn round_trip_test_compressed<const BUFFERED: bool>(
-        blobs: &[Vec<u8>],
-        compression: bool,
-    ) -> Result<(), Error> {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
-        let (_temp_dir, pathbuf, offsets) =
-            write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
-
-        let file = VirtualFile::open(pathbuf, &ctx).await?;
+        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new_with_compression(rdr, compression);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
@@ -459,7 +452,7 @@ pub(crate) mod tests {
        Ok(())
    }

-    pub(crate) fn random_array(len: usize) -> Vec<u8> {
+    fn random_array(len: usize) -> Vec<u8> {
        let mut rng = rand::thread_rng();
        (0..len).map(|_| rng.gen()).collect::<_>()
    }
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -202,10 +202,18 @@ pub struct FileBlockReader<'a> {

 impl<'a> FileBlockReader<'a> {
    pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
+        Self::new_with_compression(file, file_id, false)
+    }
+
+    pub fn new_with_compression(
+        file: &'a VirtualFile,
+        file_id: FileId,
+        compressed_reads: bool,
+    ) -> Self {
        FileBlockReader {
            file_id,
            file,
-            compressed_reads: true,
+            compressed_reads,
        }
    }

--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -335,6 +335,7 @@ pub struct TenantConf {
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
+    pub trace_read_requests: bool,
    pub eviction_policy: EvictionPolicy,
    pub min_resident_size_override: Option<u64>,
    // See the corresponding metric's help string.
@@ -435,6 +436,10 @@ pub struct TenantConfOpt {
    #[serde(default)]
    pub max_lsn_wal_lag: Option<NonZeroU64>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub trace_read_requests: Option<bool>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub eviction_policy: Option<EvictionPolicy>,
@@ -514,6 +519,9 @@ impl TenantConfOpt {
                .lagging_wal_timeout
                .unwrap_or(global_conf.lagging_wal_timeout),
            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
+            trace_read_requests: self
+                .trace_read_requests
+                .unwrap_or(global_conf.trace_read_requests),
            eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
            min_resident_size_override: self
                .min_resident_size_override
@@ -573,6 +581,7 @@ impl Default for TenantConf {
                .expect("cannot parse default walreceiver lagging wal timeout"),
            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            trace_read_requests: false,
            eviction_policy: EvictionPolicy::NoEviction,
            min_resident_size_override: None,
            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
@@ -650,6 +659,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
            lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
            max_lsn_wal_lag: value.max_lsn_wal_lag,
+            trace_read_requests: value.trace_read_requests,
            eviction_policy: value.eviction_policy,
            min_resident_size_override: value.min_resident_size_override,
            evictions_low_residence_duration_metric_threshold: value
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -262,7 +262,7 @@ where

    pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
    where
-        R: 'a + Send,
+        R: 'a,
    {
        DiskBtreeIterator {
            stream: Box::pin(self.into_stream(start_key, ctx)),
@@ -521,7 +521,7 @@ where
 pub struct DiskBtreeIterator<'a> {
    #[allow(clippy::type_complexity)]
    stream: std::pin::Pin<
-        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
+        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
    >,
 }

@@ -550,10 +550,10 @@ where
    /// We maintain the length of the stack to be always greater than zero.
    /// Two exceptions are:
    /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
-    ///    So because other methods cannot see the intermediate state invariant still holds.
+    ///   So because other methods cannot see the intermediate state invariant still holds.
    /// 2. `Self::finish`. It consumes self and does not return it back,
-    ///    which means that this is where the structure is destroyed.
-    ///    Thus stack of zero length cannot be observed by other methods.
+    ///  which means that this is where the structure is destroyed.
+    ///  Thus stack of zero length cannot be observed by other methods.
    stack: Vec<BuildNode<L>>,

    /// Last key that was appended to the tree. Used to sanity check that append
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -463,7 +463,7 @@ impl LayerMap {
    pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094

-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(&layer_desc) {
            self.l0_delta_layers.push(layer_desc.clone().into());
        }

@@ -482,7 +482,7 @@ impl LayerMap {
        self.historic
            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc.key_range) {
+        if Self::is_l0(layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -598,9 +598,8 @@ impl LayerMap {
        coverage
    }

-    /// Check if the key range resembles that of an L0 layer.
-    pub fn is_l0(key_range: &Range<Key>) -> bool {
-        key_range == &(Key::MIN..Key::MAX)
+    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
+        layer.get_key_range() == (Key::MIN..Key::MAX)
    }

    /// This function determines which layers are counted in `count_deltas`:
@@ -627,7 +626,7 @@ impl LayerMap {
    ///      than just the current partition_range.
    pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
        // Case 1
-        if !Self::is_l0(&layer.key_range) {
+        if !Self::is_l0(layer) {
            return true;
        }

@@ -845,8 +844,8 @@ impl LayerMap {
    }

    /// Return all L0 delta layers
-    pub fn get_level0_deltas(&self) -> Vec<Arc<PersistentLayerDesc>> {
-        self.l0_delta_layers.to_vec()
+    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
+        Ok(self.l0_delta_layers.to_vec())
    }

    /// debugging function to print out the contents of the layer map
--- a/Show More
+++ b/Show More