Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-08 13:10:37 +00:00)

Compare commits: hackathon/ ... release-59 (415 commits)
415 commits are listed, from d74fb7b879 (first row) to df7a9d1407 (last row). Only the SHA1 column of the commit table (Author | SHA1 | Date) was captured in this mirror view; the author, date, and commit-message cells are empty.
.github/actions/run-python-test-set/action.yml (vendored, 12 lines changed)

@@ -114,6 +114,8 @@ runs:
 export PLATFORM=${PLATFORM:-github-actions-selfhosted}
 export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
 export DEFAULT_PG_VERSION=${PG_VERSION#v}
+export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
+export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

 if [ "${BUILD_TYPE}" = "remote" ]; then
 export REMOTE_ENV=1

@@ -178,7 +180,15 @@ runs:

 # Wake up the cluster if we use remote neon instance
 if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
+QUERIES=("SELECT version()")
+if [[ "${PLATFORM}" = "neon"* ]]; then
+QUERIES+=("SHOW neon.tenant_id")
+QUERIES+=("SHOW neon.timeline_id")
+fi
+
+for q in "${QUERIES[@]}"; do
+${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
+done
 fi

 # Run the tests.
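The wake-up step above leans on two bash idioms that are easy to miss: ${VAR:-default} substitutes a default when the variable is unset, and ${PG_VERSION#v} strips a leading "v". A minimal standalone sketch of the same pattern follows; the connection string and defaults are hypothetical placeholders, not values taken from the workflow.

#!/usr/bin/env bash
set -euo pipefail

PG_VERSION=${PG_VERSION:-v16}        # fall back to v16 if the caller sets nothing
DEFAULT_PG_VERSION=${PG_VERSION#v}   # "v16" -> "16"
CONNSTR=${CONNSTR:-postgres://localhost:5432/postgres}   # hypothetical placeholder

# Build the query list, adding Neon-specific SHOW commands only on neon* platforms.
QUERIES=("SELECT version()")
if [[ "${PLATFORM:-}" = "neon"* ]]; then
  QUERIES+=("SHOW neon.tenant_id")
  QUERIES+=("SHOW neon.timeline_id")
fi

for q in "${QUERIES[@]}"; do
  psql "${CONNSTR}" -c "${q}"
done

echo "Woke up Postgres ${DEFAULT_PG_VERSION} with ${#QUERIES[@]} queries"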
.github/workflows/benchmarking.yml (vendored, 170 lines changed)

@@ -56,15 +56,26 @@ concurrency:
 jobs:
 bench:
 if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+strategy:
+matrix:
+include:
+- DEFAULT_PG_VERSION: 14
+PLATFORM: "neon-staging"
+region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+provisioner: 'k8s-pod'
+- DEFAULT_PG_VERSION: 16
+PLATFORM: "azure-staging"
+region_id: 'azure-eastus2'
+provisioner: 'k8s-neonvm'
 env:
 TEST_PG_BENCH_DURATIONS_MATRIX: "300"
 TEST_PG_BENCH_SCALES_MATRIX: "10,100"
 POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-DEFAULT_PG_VERSION: 14
+DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
 TEST_OUTPUT: /tmp/test_output
 BUILD_TYPE: remote
 SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-PLATFORM: "neon-staging"
+PLATFORM: ${{ matrix.PLATFORM }}

 runs-on: [ self-hosted, us-east-2, x64 ]
 container:

@@ -85,9 +96,10 @@ jobs:
 id: create-neon-project
 uses: ./.github/actions/neon-project-create
 with:
-region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+region_id: ${{ matrix.region_id }}
 postgres_version: ${{ env.DEFAULT_PG_VERSION }}
 api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+provisioner: ${{ matrix.provisioner }}

 - name: Run benchmark
 uses: ./.github/actions/run-python-test-set

@@ -96,10 +108,18 @@ jobs:
 test_selection: performance
 run_in_parallel: false
 save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+pg_version: ${{ env.DEFAULT_PG_VERSION }}
 # Set --sparse-ordering option of pytest-order plugin
 # to ensure tests are running in order of appears in the file.
 # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+extra_params:
+-m remote_cluster
+--sparse-ordering
+--timeout 14400
+--ignore test_runner/performance/test_perf_olap.py
+--ignore test_runner/performance/test_perf_pgvector_queries.py
+--ignore test_runner/performance/test_logical_replication.py
+--ignore test_runner/performance/test_physical_replication.py
 env:
 BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
 VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

@@ -125,6 +145,69 @@ jobs:
 env:
 SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

+replication-tests:
+env:
+POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+DEFAULT_PG_VERSION: 14
+TEST_OUTPUT: /tmp/test_output
+BUILD_TYPE: remote
+SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+PLATFORM: "neon-staging"
+
+runs-on: [ self-hosted, us-east-2, x64 ]
+container:
+image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+options: --init
+
+steps:
+- uses: actions/checkout@v4
+
+- name: Download Neon artifact
+uses: ./.github/actions/download
+with:
+name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+path: /tmp/neon/
+prefix: latest
+
+- name: Run benchmark
+uses: ./.github/actions/run-python-test-set
+with:
+build_type: ${{ env.BUILD_TYPE }}
+test_selection: performance/test_logical_replication.py
+run_in_parallel: false
+save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+extra_params: -m remote_cluster --timeout 5400
+env:
+VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+- name: Run benchmark
+uses: ./.github/actions/run-python-test-set
+with:
+build_type: ${{ env.BUILD_TYPE }}
+test_selection: performance/test_physical_replication.py
+run_in_parallel: false
+save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+extra_params: -m remote_cluster --timeout 5400
+env:
+VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+- name: Create Allure report
+if: ${{ !cancelled() }}
+uses: ./.github/actions/allure-report-generate
+
+- name: Post to a Slack channel
+if: ${{ github.event.schedule && failure() }}
+uses: slackapi/slack-github-action@v1
+with:
+channel-id: "C033QLM5P7D" # dev-staging-stream
+slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+env:
+SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
 generate-matrices:
 if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
 # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)

@@ -239,11 +322,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Create Neon Project
 if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
 id: create-neon-project

@@ -282,16 +360,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: Benchmark init
 uses: ./.github/actions/run-python-test-set
 with:

@@ -377,25 +445,12 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Set up Connection String
 id: set-up-connstr
 run: |
 CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}

-echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-
-QUERIES=("SELECT version()")
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
+echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

 - name: Benchmark pgvector hnsw indexing
 uses: ./.github/actions/run-python-test-set

@@ -417,12 +472,12 @@ jobs:
 test_selection: performance/test_perf_pgvector_queries.py
 run_in_parallel: false
 save_perf_report: ${{ env.SAVE_PERF_REPORT }}
 extra_params: -m remote_cluster --timeout 21600
 env:
 BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
 VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
 PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

 - name: Create Allure report
 if: ${{ !cancelled() }}
 uses: ./.github/actions/allure-report-generate

@@ -477,11 +532,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Set up Connection String
 id: set-up-connstr
 run: |

@@ -503,16 +553,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: ClickBench benchmark
 uses: ./.github/actions/run-python-test-set
 with:

@@ -580,11 +620,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Get Connstring Secret Name
 run: |
 case "${PLATFORM}" in

@@ -613,16 +648,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: Run TPC-H benchmark
 uses: ./.github/actions/run-python-test-set
 with:

@@ -681,11 +706,6 @@ jobs:
 path: /tmp/neon/
 prefix: latest

-- name: Add Postgres binaries to PATH
-run: |
-${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
 - name: Set up Connection String
 id: set-up-connstr
 run: |

@@ -707,16 +727,6 @@ jobs:

 echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-QUERIES=("SELECT version()")
-if [[ "${PLATFORM}" = "neon"* ]]; then
-QUERIES+=("SHOW neon.tenant_id")
-QUERIES+=("SHOW neon.timeline_id")
-fi
-
-for q in "${QUERIES[@]}"; do
-psql ${CONNSTR} -c "${q}"
-done
 
 - name: Run user examples
 uses: ./.github/actions/run-python-test-set
 with:
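The new replication-tests job wraps the run-python-test-set action around two specific suites. Stripped of the action's artifact handling, the underlying pytest invocation is roughly the sketch below; treat it as an assumption about what the action ends up running (the ./scripts/pytest wrapper and the remote_cluster marker appear elsewhere in this changeset), not as the job's literal command.

#!/usr/bin/env bash
set -euo pipefail

# Environment the job provides; the action is assumed to export the rest
# (POSTGRES_DISTRIB_DIR, BENCHMARK_CONNSTR, ...) before invoking pytest.
export BUILD_TYPE=remote
export NEON_API_KEY="..."   # placeholder for secrets.NEON_STAGING_API_KEY

./scripts/pytest -m remote_cluster --timeout 5400 \
  test_runner/performance/test_logical_replication.py \
  test_runner/performance/test_physical_replication.py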
(workflow file whose path was not captured in this view; vendored)

@@ -63,14 +63,16 @@ jobs:
 mkdir -p /tmp/.docker-custom
 echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV

-- uses: docker/setup-buildx-action@v2
+- uses: docker/setup-buildx-action@v3
+with:
+cache-binary: false

-- uses: docker/login-action@v2
+- uses: docker/login-action@v3
 with:
 username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
 password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-- uses: docker/build-push-action@v4
+- uses: docker/build-push-action@v6
 with:
 context: .
 provenance: false

@@ -82,6 +84,7 @@ jobs:
 tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}

 - name: Remove custom docker config directory
+if: always()
 run: |
 rm -rf /tmp/.docker-custom

.github/workflows/build_and_test.yml (vendored, 57 lines changed)

@@ -30,7 +30,7 @@ jobs:
 if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
 uses: ./.github/workflows/check-permissions.yml
 with:
-github-event-name: ${{ github.event_name}}
+github-event-name: ${{ github.event_name }}

 cancel-previous-e2e-tests:
 needs: [ check-permissions ]

@@ -335,6 +335,8 @@ jobs:

 - name: Run cargo build
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
 ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

 # Do install *before* running rust tests because they might recompile the

@@ -383,6 +385,11 @@ jobs:
 env:
 NEXTEST_RETRIES: 3
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
+LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+export LD_LIBRARY_PATH
+
 #nextest does not yet support running doctests
 cargo test --doc $CARGO_FLAGS $CARGO_FEATURES

@@ -744,14 +751,16 @@ jobs:
 run: |
 mkdir -p .docker-custom
 echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-- uses: docker/setup-buildx-action@v2
+- uses: docker/setup-buildx-action@v3
+with:
+cache-binary: false

 - uses: docker/login-action@v3
 with:
 username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
 password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-- uses: docker/build-push-action@v5
+- uses: docker/build-push-action@v6
 with:
 context: .
 build-args: |

@@ -822,11 +831,12 @@ jobs:
 run: |
 mkdir -p .docker-custom
 echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-- uses: docker/setup-buildx-action@v2
+- uses: docker/setup-buildx-action@v3
 with:
+cache-binary: false
 # Disable parallelism for docker buildkit.
 # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
-config-inline: |
+buildkitd-config-inline: |
 [worker.oci]
 max-parallelism = 1

@@ -842,7 +852,7 @@ jobs:
 password: ${{ secrets.AWS_SECRET_KEY_DEV }}

 - name: Build compute-node image
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v6
 with:
 context: .
 build-args: |

@@ -861,7 +871,7 @@ jobs:

 - name: Build neon extensions test image
 if: matrix.version == 'v16'
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v6
 with:
 context: .
 build-args: |

@@ -882,7 +892,7 @@ jobs:
 - name: Build compute-tools image
 # compute-tools are Postgres independent, so build it only once
 if: matrix.version == 'v16'
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v6
 with:
 target: compute-tools-image
 context: .

@@ -1326,6 +1336,7 @@ jobs:
 env:
 BUCKET: neon-github-public-dev
 PREFIX: artifacts/latest
+COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
 run: |
 # Update compatibility snapshot for the release
 for pg_version in v14 v15 v16; do

@@ -1339,7 +1350,7 @@ jobs:

 # Update Neon artifact for the release (reuse already uploaded artifact)
 for build_type in debug release; do
-OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
+OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
 FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst

 S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)

@@ -1358,3 +1369,31 @@ jobs:
 with:
 from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
 secrets: inherit
+
+# This job simplifies setting branch protection rules (in GitHub UI)
+# by allowing to set only this job instead of listing many others.
+# It also makes it easier to rename or parametrise jobs (using matrix)
+# which requires changes in branch protection rules
+#
+# Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that.
+#
+# https://github.com/neondatabase/neon/settings/branch_protection_rules
+conclusion:
+if: always()
+# Format `needs` differently to make the list more readable.
+# Usually we do `needs: [...]`
+needs:
+- check-codestyle-python
+- check-codestyle-rust
+- regress-tests
+- test-images
+runs-on: ubuntu-22.04
+steps:
+# The list of possible results:
+# https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
+- name: Fail the job if any of the dependencies do not succeed
+run: exit 1
+if: |
+contains(needs.*.result, 'failure')
+|| contains(needs.*.result, 'cancelled')
+|| contains(needs.*.result, 'skipped')
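Several hunks in this file (and in neon_extra_builds.yml below) export PQ_LIB_DIR and LD_LIBRARY_PATH before invoking cargo. A minimal local equivalent is sketched here; it assumes the pg_install/v16 directory already exists in the working tree (for example after building Postgres via the repository's Makefile) and leaves out the workflow's $CARGO_FLAGS and $CARGO_FEATURES variables.

#!/usr/bin/env bash
set -euo pipefail

# Let the libpq-linking crates find the locally built Postgres v16 libraries,
# and make the shared objects resolvable when the test binaries run.
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH

cargo build --bins --tests
cargo test --doc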
.github/workflows/neon_extra_builds.yml (vendored, 9 lines changed)

@@ -232,12 +232,19 @@ jobs:

 - name: Run cargo build
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
 mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)

 - name: Run cargo test
 env:
 NEXTEST_RETRIES: 3
 run: |
+PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+export PQ_LIB_DIR
+LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+export LD_LIBRARY_PATH
+
 cargo nextest run $CARGO_FEATURES -j$(nproc)

 # Run separate tests for real S3

@@ -378,7 +385,7 @@ jobs:
 run: make walproposer-lib -j$(nproc)

 - name: Produce the build stats
-run: cargo build --all --release --timings -j$(nproc)
+run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)

 - name: Upload the build stats
 id: upload-stats
.github/workflows/periodic_pagebench.yml (vendored, new file, 155 lines added)

@@ -0,0 +1,155 @@
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region

on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:
type: string
description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
required: false
default: ''

defaults:
run:
shell: bash -euo pipefail {0}

concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false

jobs:
trigger_bench_on_ec2_machine_in_eu_central_1:
runs-on: [ self-hosted, gen3, small ]
container:
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
timeout-minutes: 360 # Set the timeout to 6 hours
env:
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
RUN_ID: ${{ github.run_id }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
AWS_DEFAULT_REGION : "eu-central-1"
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
steps:
# we don't need the neon source code because we run everything remotely
# however we still need the local github actions to run the allure step below
- uses: actions/checkout@v4

- name: Show my own (github runner) external IP address - usefull for IP allowlisting
run: curl https://ifconfig.me

- name: Start EC2 instance and wait for the instance to boot up
run: |
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
sleep 60 # sleep some time to allow cloudinit and our API server to start up

- name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
run: |
public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
echo "Public IP of the EC2 instance: $public_ip"
echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV

- name: Determine commit hash
env:
INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
fi

- name: Start Bench with run_id
run: |
curl -k -X 'POST' \
"${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"

- name: Poll Test Status
id: poll_step
run: |
status=""
while [[ "$status" != "failure" && "$status" != "success" ]]; do
response=$(curl -k -X 'GET' \
"${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H "Authorization: Bearer $API_KEY")
echo "Response: $response"
set +x
status=$(echo $response | jq -r '.status')
echo "Test status: $status"
if [[ "$status" == "failure" ]]; then
echo "Test failed"
exit 1 # Fail the job step if status is failure
elif [[ "$status" == "success" || "$status" == "null" ]]; then
break
elif [[ "$status" == "too_many_runs" ]]; then
echo "Too many runs already running"
echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
exit 1
fi

sleep 60 # Poll every 60 seconds
done

- name: Retrieve Test Logs
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
curl -k -X 'GET' \
"${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
-H 'accept: application/gzip' \
-H "Authorization: Bearer $API_KEY" \
--output "test_log_${GITHUB_RUN_ID}.gz"

- name: Unzip Test Log and Print it into this job's log
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
gzip -d "test_log_${GITHUB_RUN_ID}.gz"
cat "test_log_${GITHUB_RUN_ID}"

- name: Create Allure report
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate

- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

- name: Cleanup Test Resources
if: always()
run: |
curl -k -X 'POST' \
"${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d ''

- name: Stop EC2 instance and wait for the instance to be stopped
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
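The workflow above drives a small HTTP API on the dedicated EC2 runner. A condensed sketch of the same call sequence follows, assuming EC2_MACHINE_URL_US and API_KEY are already set in the environment exactly as the workflow sets them; it simplifies the status handling (no too_many_runs branch) and uses a locally generated run id, which the API may or may not accept, so treat it as an assumption rather than a supported client.

#!/usr/bin/env bash
set -euo pipefail

RUN_ID=manual-$(date +%s)   # assumed: any unique identifier works; the workflow uses the GitHub run id
COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')

# Kick off a benchmark run for the chosen pageserver commit.
curl -k -X POST "${EC2_MACHINE_URL_US}/start_test/${RUN_ID}" \
  -H 'Content-Type: application/json' \
  -H "Authorization: Bearer ${API_KEY}" \
  -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"

# Poll until the run finishes, then fetch the log and clean up.
while true; do
  status=$(curl -sk "${EC2_MACHINE_URL_US}/test_status/${RUN_ID}" \
    -H "Authorization: Bearer ${API_KEY}" | jq -r '.status')
  [[ "$status" == "success" || "$status" == "failure" || "$status" == "null" ]] && break
  sleep 60
done

curl -sk "${EC2_MACHINE_URL_US}/test_log/${RUN_ID}" \
  -H "Authorization: Bearer ${API_KEY}" --output "test_log_${RUN_ID}.gz"
curl -sk -X POST "${EC2_MACHINE_URL_US}/cleanup_test/${RUN_ID}" \
  -H "Authorization: Bearer ${API_KEY}" -d ''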
.github/workflows/pg-clients.yml (vendored, new file, 115 lines added)

@@ -0,0 +1,115 @@
name: Test Postgres client libraries

on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc
pull_request:
paths:
- '.github/workflows/pg-clients.yml'
- 'test_runner/pg_clients/**'
- 'poetry.lock'
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}

defaults:
run:
shell: bash -euxo pipefail {0}

env:
DEFAULT_PG_VERSION: 16
PLATFORM: neon-captest-new
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
AWS_DEFAULT_REGION: eu-central-1

jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}

check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml

build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit

test-postgres-client-libs:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04

container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init --user root

steps:
- uses: actions/checkout@v4

- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest

- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}

- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: pg_clients
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}

- name: Delete Neon Project
if: always()
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

- name: Post to a Slack channel
if: github.event.schedule && failure()
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
.github/workflows/pg_clients.yml (vendored, deleted, all 98 lines removed)

@@ -1,98 +0,0 @@
name: Test Postgres client libraries

on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc

workflow_dispatch:

concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true

jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: ubuntu-22.04

env:
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output

steps:
- name: Checkout
uses: actions/checkout@v4

- uses: actions/setup-python@v4
with:
python-version: 3.9

- name: Install Poetry
uses: snok/install-poetry@v1

- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}

- name: Install Python deps
shell: bash -euxo pipefail {0}
run: ./scripts/pysync

- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}

- name: Run pytest
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "remote_cluster" \
-rA "test_runner/pg_clients"

- name: Delete Neon Project
if: ${{ always() }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

# We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
# It will be fixed after switching to gen2 runner
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v4
with:
retention-days: 7
name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs
path: ${{ env.TEST_OUTPUT }}

- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
Cargo.lock (generated, 105 lines changed)

@@ -1236,6 +1236,7 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
+"rlimit",
 "rust-ini",
 "serde",
 "serde_json",

@@ -1397,9 +1398,9 @@ dependencies = [

 [[package]]
 name = "crc32c"
-version = "0.6.5"
+version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
+checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
 dependencies = [
 "rustc_version",
 ]

@@ -1651,6 +1652,16 @@ dependencies = [
 "rusticata-macros",
 ]

+[[package]]
+name = "deranged"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+dependencies = [
+"powerfmt",
+"serde",
+]
+
 [[package]]
 name = "desim"
 version = "0.1.0"

@@ -2017,16 +2028,6 @@ dependencies = [
 "tokio-util",
 ]

-[[package]]
-name = "fs2"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
-dependencies = [
-"libc",
-"winapi",
-]
-
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"

@@ -3008,9 +3009,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
+checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
 dependencies = [
 "bytes",
 "crossbeam-utils",

@@ -3026,9 +3027,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
+checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",

@@ -3038,9 +3039,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
+checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
 dependencies = [
 "libc",
 "measured",

@@ -3275,6 +3276,12 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "num-conv"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+
 [[package]]
 name = "num-integer"
 version = "0.1.45"

@@ -3667,6 +3674,7 @@ dependencies = [
 "sysinfo",
 "tenant_size_model",
 "thiserror",
+"tikv-jemallocator",
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",

@@ -4077,6 +4085,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
+"tokio-util",
 "tracing",
 "workspace_hack",
 ]

@@ -4117,6 +4126,12 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"

@@ -4877,6 +4892,15 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "rlimit"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
+dependencies = [
+"libc",
+]
+
 [[package]]
 name = "routerify"
 version = "3.0.0"

@@ -5145,7 +5169,6 @@ dependencies = [
 "crc32c",
 "desim",
 "fail",
-"fs2",
 "futures",
 "git-version",
 "hex",

@@ -5172,6 +5195,8 @@ dependencies = [
 "sha2",
 "signal-hook",
 "storage_broker",
+"strum",
+"strum_macros",
 "thiserror",
 "tokio",
 "tokio-io-timeout",

@@ -5396,9 +5421,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

 [[package]]
 name = "serde"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
+checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
 dependencies = [
 "serde_derive",
 ]

@@ -5415,9 +5440,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
+checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
 dependencies = [
 "proc-macro2",
 "quote",

@@ -6107,12 +6132,15 @@ dependencies = [

 [[package]]
 name = "time"
-version = "0.3.21"
+version = "0.3.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
+checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
 dependencies = [
+"deranged",
 "itoa",
 "js-sys",
+"num-conv",
+"powerfmt",
 "serde",
 "time-core",
 "time-macros",

@@ -6120,16 +6148,17 @@ dependencies = [

 [[package]]
name = "time-core"
|
name = "time-core"
|
||||||
version = "0.1.1"
|
version = "0.1.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
|
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "time-macros"
|
name = "time-macros"
|
||||||
version = "0.2.9"
|
version = "0.2.18"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
|
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"num-conv",
|
||||||
"time-core",
|
"time-core",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -6472,17 +6501,6 @@ version = "0.3.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "trace"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"clap",
|
|
||||||
"pageserver_api",
|
|
||||||
"utils",
|
|
||||||
"workspace_hack",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tracing"
|
name = "tracing"
|
||||||
version = "0.1.37"
|
version = "0.1.37"
|
||||||
@@ -6811,6 +6829,7 @@ dependencies = [
|
|||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tokio-tar",
|
"tokio-tar",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
|
"toml_edit 0.19.10",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-error",
|
"tracing-error",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
@@ -7426,13 +7445,12 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"clap_builder",
|
"clap_builder",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
|
"deranged",
|
||||||
"either",
|
"either",
|
||||||
"fail",
|
"fail",
|
||||||
"futures-channel",
|
"futures-channel",
|
||||||
"futures-core",
|
|
||||||
"futures-executor",
|
"futures-executor",
|
||||||
"futures-io",
|
"futures-io",
|
||||||
"futures-sink",
|
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"getrandom 0.2.11",
|
"getrandom 0.2.11",
|
||||||
"hashbrown 0.14.5",
|
"hashbrown 0.14.5",
|
||||||
@@ -7450,7 +7468,9 @@ dependencies = [
|
|||||||
"num-traits",
|
"num-traits",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"parquet",
|
"parquet",
|
||||||
|
"proc-macro2",
|
||||||
"prost",
|
"prost",
|
||||||
|
"quote",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"regex",
|
"regex",
|
||||||
"regex-automata 0.4.3",
|
"regex-automata 0.4.3",
|
||||||
@@ -7467,6 +7487,7 @@ dependencies = [
|
|||||||
"syn 1.0.109",
|
"syn 1.0.109",
|
||||||
"syn 2.0.52",
|
"syn 2.0.52",
|
||||||
"sync_wrapper",
|
"sync_wrapper",
|
||||||
|
"tikv-jemalloc-sys",
|
||||||
"time",
|
"time",
|
||||||
"time-macros",
|
"time-macros",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ members = [
|
|||||||
"storage_controller",
|
"storage_controller",
|
||||||
"storage_scrubber",
|
"storage_scrubber",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
"trace",
|
|
||||||
"libs/compute_api",
|
"libs/compute_api",
|
||||||
"libs/pageserver_api",
|
"libs/pageserver_api",
|
||||||
"libs/postgres_ffi",
|
"libs/postgres_ffi",
|
||||||
@@ -84,7 +83,6 @@ enumset = "1.0.12"
|
|||||||
fail = "0.5.0"
|
fail = "0.5.0"
|
||||||
fallible-iterator = "0.2"
|
fallible-iterator = "0.2"
|
||||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||||
fs2 = "0.4.3"
|
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
@@ -111,8 +109,8 @@ lasso = "0.7"
|
|||||||
leaky-bucket = "1.0.1"
|
leaky-bucket = "1.0.1"
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
measured = { version = "0.0.21", features=["lasso"] }
|
measured = { version = "0.0.22", features=["lasso"] }
|
||||||
measured-process = { version = "0.0.21" }
|
measured-process = { version = "0.0.22" }
|
||||||
memoffset = "0.8"
|
memoffset = "0.8"
|
||||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||||
notify = "6.0.0"
|
notify = "6.0.0"
|
||||||
|
|||||||
@@ -42,12 +42,13 @@ ARG CACHEPOT_BUCKET=neon-github-dev
|
|||||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||||
|
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
||||||
COPY --chown=nonroot . .
|
COPY --chown=nonroot . .
|
||||||
|
|
||||||
# Show build caching stats to check if it was used in the end.
|
# Show build caching stats to check if it was used in the end.
|
||||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||||
RUN set -e \
|
RUN set -e \
|
||||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||||
--bin pg_sni_router \
|
--bin pg_sni_router \
|
||||||
--bin pageserver \
|
--bin pageserver \
|
||||||
--bin pagectl \
|
--bin pagectl \
|
||||||
@@ -56,6 +57,7 @@ RUN set -e \
|
|||||||
--bin storage_controller \
|
--bin storage_controller \
|
||||||
--bin proxy \
|
--bin proxy \
|
||||||
--bin neon_local \
|
--bin neon_local \
|
||||||
|
--bin storage_scrubber \
|
||||||
--locked --release \
|
--locked --release \
|
||||||
&& cachepot -s
|
&& cachepot -s
|
||||||
|
|
||||||
@@ -82,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker
|
|||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||||
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
|
||||||
|
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||||
|
|||||||
@@ -1,5 +1,13 @@
|
|||||||
FROM debian:bullseye-slim
|
FROM debian:bullseye-slim
|
||||||
|
|
||||||
|
# Use ARG as a build-time environment variable here to allow.
|
||||||
|
# It's not supposed to be set outside.
|
||||||
|
# Alternatively it can be obtained using the following command
|
||||||
|
# ```
|
||||||
|
# . /etc/os-release && echo "${VERSION_CODENAME}"
|
||||||
|
# ```
|
||||||
|
ARG DEBIAN_VERSION_CODENAME=bullseye
|
||||||
|
|
||||||
# Add nonroot user
|
# Add nonroot user
|
||||||
RUN useradd -ms /bin/bash nonroot -b /home
|
RUN useradd -ms /bin/bash nonroot -b /home
|
||||||
SHELL ["/bin/bash", "-c"]
|
SHELL ["/bin/bash", "-c"]
|
||||||
@@ -26,7 +34,6 @@ RUN set -e \
|
|||||||
liblzma-dev \
|
liblzma-dev \
|
||||||
libncurses5-dev \
|
libncurses5-dev \
|
||||||
libncursesw5-dev \
|
libncursesw5-dev \
|
||||||
libpq-dev \
|
|
||||||
libreadline-dev \
|
libreadline-dev \
|
||||||
libseccomp-dev \
|
libseccomp-dev \
|
||||||
libsqlite3-dev \
|
libsqlite3-dev \
|
||||||
@@ -67,12 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
|||||||
# LLVM
|
# LLVM
|
||||||
ENV LLVM_VERSION=18
|
ENV LLVM_VERSION=18
|
||||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||||
&& apt update \
|
&& apt update \
|
||||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
|
# Install docker
|
||||||
|
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
|
||||||
|
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
|
||||||
|
&& apt update \
|
||||||
|
&& apt install -y docker-ce docker-ce-cli \
|
||||||
|
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
|
# Configure sudo & docker
|
||||||
|
RUN usermod -aG sudo nonroot && \
|
||||||
|
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
|
||||||
|
usermod -aG docker nonroot
|
||||||
|
|
||||||
# AWS CLI
|
# AWS CLI
|
||||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||||
&& unzip -q awscliv2.zip \
|
&& unzip -q awscliv2.zip \
|
||||||
|
|||||||
@@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
|||||||
zstd = "0.13"
|
zstd = "0.13"
|
||||||
bytes = "1.0"
|
bytes = "1.0"
|
||||||
rust-ini = "0.20.0"
|
rust-ini = "0.20.0"
|
||||||
|
rlimit = "0.10.1"
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
//! - Every start is a fresh start, so the data directory is removed and
|
//! - Every start is a fresh start, so the data directory is removed and
|
||||||
//! initialized again on each run.
|
//! initialized again on each run.
|
||||||
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
||||||
//! and download `shared_preload_libraries` from the remote storage.
|
//! and download `shared_preload_libraries` from the remote storage.
|
||||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||||
//! - Sync safekeepers and get commit LSN.
|
//! - Sync safekeepers and get commit LSN.
|
||||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||||
@@ -33,7 +33,6 @@
|
|||||||
//! -b /usr/local/bin/postgres \
|
//! -b /usr/local/bin/postgres \
|
||||||
//! -r http://pg-ext-s3-gateway \
|
//! -r http://pg-ext-s3-gateway \
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
@@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor;
|
|||||||
use compute_tools::params::*;
|
use compute_tools::params::*;
|
||||||
use compute_tools::spec::*;
|
use compute_tools::spec::*;
|
||||||
use compute_tools::swap::resize_swap;
|
use compute_tools::swap::resize_swap;
|
||||||
|
use rlimit::{setrlimit, Resource};
|
||||||
|
|
||||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||||
// in-case of not-set environment var
|
// in-case of not-set environment var
|
||||||
@@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest";
|
|||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
let (build_tag, clap_args) = init()?;
|
let (build_tag, clap_args) = init()?;
|
||||||
|
|
||||||
|
// enable core dumping for all child processes
|
||||||
|
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||||
|
|
||||||
let (pg_handle, start_pg_result) = {
|
let (pg_handle, start_pg_result) = {
|
||||||
// Enter startup tracing context
|
// Enter startup tracing context
|
||||||
let _startup_context_guard = startup_context_from_env();
|
let _startup_context_guard = startup_context_from_env();
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ pub struct ComputeNode {
|
|||||||
/// - we push new spec and it does reconfiguration
|
/// - we push new spec and it does reconfiguration
|
||||||
/// - but then something happens and compute pod / VM is destroyed,
|
/// - but then something happens and compute pod / VM is destroyed,
|
||||||
/// so k8s controller starts it again with the **old** spec
|
/// so k8s controller starts it again with the **old** spec
|
||||||
|
///
|
||||||
/// and the same for empty computes:
|
/// and the same for empty computes:
|
||||||
/// - we started compute without any spec
|
/// - we started compute without any spec
|
||||||
/// - we push spec and it does configuration
|
/// - we push spec and it does configuration
|
||||||
@@ -798,7 +799,11 @@ impl ComputeNode {
|
|||||||
// In this case we need to connect with old `zenith_admin` name
|
// In this case we need to connect with old `zenith_admin` name
|
||||||
// and create new user. We cannot simply rename connected user,
|
// and create new user. We cannot simply rename connected user,
|
||||||
// but we can create a new one and grant it all privileges.
|
// but we can create a new one and grant it all privileges.
|
||||||
let connstr = self.connstr.clone();
|
let mut connstr = self.connstr.clone();
|
||||||
|
connstr
|
||||||
|
.query_pairs_mut()
|
||||||
|
.append_pair("application_name", "apply_config");
|
||||||
|
|
||||||
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
||||||
Err(e) => match e.code() {
|
Err(e) => match e.code() {
|
||||||
Some(&SqlState::INVALID_PASSWORD)
|
Some(&SqlState::INVALID_PASSWORD)
|
||||||
@@ -867,15 +872,19 @@ impl ComputeNode {
|
|||||||
|
|
||||||
// Run migrations separately to not hold up cold starts
|
// Run migrations separately to not hold up cold starts
|
||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
|
let mut connstr = connstr.clone();
|
||||||
|
connstr
|
||||||
|
.query_pairs_mut()
|
||||||
|
.append_pair("application_name", "migrations");
|
||||||
|
|
||||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
handle_migrations(&mut client).context("apply_config handle_migrations")
|
||||||
});
|
});
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
|
// Wrapped this around `pg_ctl reload`, but right now we don't use
|
||||||
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
// `pg_ctl` for start / stop.
|
||||||
// have opened connection to Postgres and superuser access.
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
fn pg_reload_conf(&self) -> Result<()> {
|
fn pg_reload_conf(&self) -> Result<()> {
|
||||||
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
||||||
@@ -1108,7 +1117,7 @@ impl ComputeNode {
         // EKS worker nodes have following core dump settings:
         // /proc/sys/kernel/core_pattern -> core
         // /proc/sys/kernel/core_uses_pid -> 1
-        // ulimint -c -> unlimited
+        // ulimit -c -> unlimited
         // which results in core dumps being written to postgres data directory as core.<pid>.
         //
         // Use that as a default location and pattern, except macos where core dumps are written
@@ -1387,7 +1396,9 @@ pub fn forward_termination_signal() {
     let pg_pid = PG_PID.load(Ordering::SeqCst);
     if pg_pid != 0 {
         let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
-        kill(pg_pid, Signal::SIGQUIT).ok();
+        // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
+        // ROs to get a list of running xacts faster instead of going through the CLOG.
+        // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
+        kill(pg_pid, Signal::SIGINT).ok();
     }
 }
|
|||||||
@@ -11,6 +11,7 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
+mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
compute_tools/src/migration.rs (new file, 100 lines)
@@ -0,0 +1,100 @@
+use anyhow::{Context, Result};
+use postgres::Client;
+use tracing::info;
+
+pub(crate) struct MigrationRunner<'m> {
+    client: &'m mut Client,
+    migrations: &'m [&'m str],
+}
+
+impl<'m> MigrationRunner<'m> {
+    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
+        Self { client, migrations }
+    }
+
+    fn get_migration_id(&mut self) -> Result<i64> {
+        let query = "SELECT id FROM neon_migration.migration_id";
+        let row = self
+            .client
+            .query_one(query, &[])
+            .context("run_migrations get migration_id")?;
+
+        Ok(row.get::<&str, i64>("id"))
+    }
+
+    fn update_migration_id(&mut self) -> Result<()> {
+        let setval = format!(
+            "UPDATE neon_migration.migration_id SET id={}",
+            self.migrations.len()
+        );
+
+        self.client
+            .simple_query(&setval)
+            .context("run_migrations update id")?;
+
+        Ok(())
+    }
+
+    fn prepare_migrations(&mut self) -> Result<()> {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        self.client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        self.client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        self.client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        self.client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        self.client.simple_query(query)?;
+
+        Ok(())
+    }
+
+    pub fn run_migrations(mut self) -> Result<()> {
+        self.prepare_migrations()?;
+
+        let mut current_migration: usize = self.get_migration_id()? as usize;
+        let starting_migration_id = current_migration;
+
+        let query = "BEGIN";
+        self.client
+            .simple_query(query)
+            .context("run_migrations begin")?;
+
+        while current_migration < self.migrations.len() {
+            let migration = self.migrations[current_migration];
+
+            if migration.starts_with("-- SKIP") {
+                info!("Skipping migration id={}", current_migration);
+            } else {
+                info!(
+                    "Running migration id={}:\n{}\n",
+                    current_migration, migration
+                );
+                self.client.simple_query(migration).with_context(|| {
+                    format!("run_migration current_migration={}", current_migration)
+                })?;
+            }
+
+            current_migration += 1;
+        }
+
+        self.update_migration_id()?;
+
+        let query = "COMMIT";
+        self.client
+            .simple_query(query)
+            .context("run_migrations commit")?;
+
+        info!(
+            "Ran {} migrations",
+            (self.migrations.len() - starting_migration_id)
+        );
+
+        Ok(())
+    }
+}
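As an aside (illustrative only, not part of the patch), a minimal usage sketch of the runner above: it assumes an already-connected `postgres::Client` and two inline migration strings, whereas the real caller in `spec.rs` further down passes `include_str!` contents. Since `MigrationRunner` is crate-private, this would only compile inside `compute_tools`.

```rust
use anyhow::Result;
use postgres::{Client, NoTls};
// use crate::migration::MigrationRunner; // crate-private, usable from within compute_tools

fn run_example_migrations(connstr: &str) -> Result<()> {
    // Hypothetical connection string and migrations, for illustration only.
    let mut client = Client::connect(connstr, NoTls)?;
    let migrations: &[&str] = &[
        "CREATE TABLE IF NOT EXISTS example (id bigint)",
        "-- SKIP: placeholder kept only so later migration ids stay stable",
    ];
    // Applies every migration past the id persisted in neon_migration.migration_id,
    // inside one BEGIN/COMMIT, then bumps the id to migrations.len(). Entries that
    // start with "-- SKIP" are logged and skipped.
    MigrationRunner::new(&mut client, migrations).run_migrations()
}
```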
@@ -0,0 +1,7 @@
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+        EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
+        EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
+    END IF;
+END $$;
@@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
 /// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
 /// - next line starts with timestamp
 /// - EOF
-/// - no new lines were written for the last second
+/// - no new lines were written for the last 100 milliseconds
 async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
     let mut lines = tokio::io::BufReader::new(stderr).lines();
     let timeout_duration = Duration::from_millis(100);
@@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::logger::inlinify;
+use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -789,71 +790,12 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
             "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
         ),
         include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
+        include_str!(
+            "./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
+        ),
     ];

-    let mut func = || {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        client.simple_query(query)?;
-
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        client.simple_query(query)?;
-
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        client.simple_query(query)?;
-
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        client.simple_query(query)?;
-
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        client.simple_query(query)?;
-        Ok::<_, anyhow::Error>(())
-    };
-    func().context("handle_migrations prepare")?;
-
-    let query = "SELECT id FROM neon_migration.migration_id";
-    let row = client
-        .query_one(query, &[])
-        .context("handle_migrations get migration_id")?;
-    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
-    let starting_migration_id = current_migration;
-
-    let query = "BEGIN";
-    client
-        .simple_query(query)
-        .context("handle_migrations begin")?;
-
-    while current_migration < migrations.len() {
-        let migration = &migrations[current_migration];
-        if migration.starts_with("-- SKIP") {
-            info!("Skipping migration id={}", current_migration);
-        } else {
-            info!(
-                "Running migration id={}:\n{}\n",
-                current_migration, migration
-            );
-            client.simple_query(migration).with_context(|| {
-                format!("handle_migrations current_migration={}", current_migration)
-            })?;
-        }
-        current_migration += 1;
-    }
-    let setval = format!(
-        "UPDATE neon_migration.migration_id SET id={}",
-        migrations.len()
-    );
-    client
-        .simple_query(&setval)
-        .context("handle_migrations update id")?;
-
-    let query = "COMMIT";
-    client
-        .simple_query(query)
-        .context("handle_migrations commit")?;
-
-    info!(
-        "Ran {} migrations",
-        (migrations.len() - starting_migration_id)
-    );
-
+    MigrationRunner::new(client, &migrations).run_migrations()?;

     Ok(())
 }
|
|||||||
@@ -325,11 +325,16 @@ impl LocalEnv {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
|
||||||
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
|
Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||||
|
self.pg_dir(pg_version, "bin")
|
||||||
|
}
|
||||||
|
|
||||||
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||||
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
|
self.pg_dir(pg_version, "lib")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn pageserver_bin(&self) -> PathBuf {
|
pub fn pageserver_bin(&self) -> PathBuf {
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use futures::SinkExt;
|
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||||
};
|
};
|
||||||
@@ -350,11 +349,6 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<NonZeroU64>())
|
.map(|x| x.parse::<NonZeroU64>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||||
trace_read_requests: settings
|
|
||||||
.remove("trace_read_requests")
|
|
||||||
.map(|x| x.parse::<bool>())
|
|
||||||
.transpose()
|
|
||||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
|
||||||
eviction_policy: settings
|
eviction_policy: settings
|
||||||
.remove("eviction_policy")
|
.remove("eviction_policy")
|
||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
@@ -455,11 +449,6 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<NonZeroU64>())
|
.map(|x| x.parse::<NonZeroU64>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||||
trace_read_requests: settings
|
|
||||||
.remove("trace_read_requests")
|
|
||||||
.map(|x| x.parse::<bool>())
|
|
||||||
.transpose()
|
|
||||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
|
||||||
eviction_policy: settings
|
eviction_policy: settings
|
||||||
.remove("eviction_policy")
|
.remove("eviction_policy")
|
||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
@@ -566,60 +555,39 @@ impl PageServerNode {
|
|||||||
pg_wal: Option<(Lsn, PathBuf)>,
|
pg_wal: Option<(Lsn, PathBuf)>,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let (client, conn) = self.page_server_psql_client().await?;
|
|
||||||
// The connection object performs the actual communication with the database,
|
|
||||||
// so spawn it off to run on its own.
|
|
||||||
tokio::spawn(async move {
|
|
||||||
if let Err(e) = conn.await {
|
|
||||||
eprintln!("connection error: {}", e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
let client = std::pin::pin!(client);
|
|
||||||
|
|
||||||
// Init base reader
|
// Init base reader
|
||||||
let (start_lsn, base_tarfile_path) = base;
|
let (start_lsn, base_tarfile_path) = base;
|
||||||
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
||||||
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
|
let base_tarfile =
|
||||||
|
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
|
||||||
|
|
||||||
// Init wal reader if necessary
|
// Init wal reader if necessary
|
||||||
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||||
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
||||||
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
|
let wal_reader =
|
||||||
|
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
|
||||||
(end_lsn, Some(wal_reader))
|
(end_lsn, Some(wal_reader))
|
||||||
} else {
|
} else {
|
||||||
(start_lsn, None)
|
(start_lsn, None)
|
||||||
};
|
};
|
||||||
|
|
||||||
let copy_in = |reader, cmd| {
|
|
||||||
let client = &client;
|
|
||||||
async move {
|
|
||||||
let writer = client.copy_in(&cmd).await?;
|
|
||||||
let writer = std::pin::pin!(writer);
|
|
||||||
let mut writer = writer.sink_map_err(|e| {
|
|
||||||
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
|
|
||||||
});
|
|
||||||
let mut reader = std::pin::pin!(reader);
|
|
||||||
writer.send_all(&mut reader).await?;
|
|
||||||
writer.into_inner().finish().await?;
|
|
||||||
anyhow::Ok(())
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Import base
|
// Import base
|
||||||
copy_in(
|
self.http_client
|
||||||
base_tarfile,
|
.import_basebackup(
|
||||||
format!(
|
tenant_id,
|
||||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
timeline_id,
|
||||||
),
|
start_lsn,
|
||||||
)
|
end_lsn,
|
||||||
.await?;
|
pg_version,
|
||||||
// Import wal if necessary
|
base_tarfile,
|
||||||
if let Some(wal_reader) = wal_reader {
|
|
||||||
copy_in(
|
|
||||||
wal_reader,
|
|
||||||
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
|
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
// Import wal if necessary
|
||||||
|
if let Some(wal_reader) = wal_reader {
|
||||||
|
self.http_client
|
||||||
|
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
|
||||||
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -155,16 +155,16 @@ impl StorageController {
|
|||||||
.expect("non-Unicode path")
|
.expect("non-Unicode path")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
/// Find the directory containing postgres subdirectories, such `bin` and `lib`
|
||||||
///
|
///
|
||||||
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
||||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
|
||||||
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
||||||
|
|
||||||
for v in prefer_versions {
|
for v in prefer_versions {
|
||||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
|
||||||
if tokio::fs::try_exists(&path).await? {
|
if tokio::fs::try_exists(&path).await? {
|
||||||
return Ok(path);
|
return Ok(path);
|
||||||
}
|
}
|
||||||
@@ -172,11 +172,20 @@ impl StorageController {
|
|||||||
|
|
||||||
// Fall through
|
// Fall through
|
||||||
anyhow::bail!(
|
anyhow::bail!(
|
||||||
"Postgres binaries not found in {}",
|
"Postgres directory '{}' not found in {}",
|
||||||
self.env.pg_distrib_dir.display()
|
dir_name,
|
||||||
|
self.env.pg_distrib_dir.display(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||||
|
self.get_pg_dir("bin").await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||||
|
self.get_pg_dir("lib").await
|
||||||
|
}
|
||||||
|
|
||||||
/// Readiness check for our postgres process
|
/// Readiness check for our postgres process
|
||||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||||
let bin_path = pg_bin_dir.join("pg_isready");
|
let bin_path = pg_bin_dir.join("pg_isready");
|
||||||
@@ -229,12 +238,17 @@ impl StorageController {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.join("storage_controller_db");
|
.join("storage_controller_db");
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
|
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||||
let pg_log_path = pg_data_path.join("postgres.log");
|
let pg_log_path = pg_data_path.join("postgres.log");
|
||||||
|
|
||||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||||
// Initialize empty database
|
// Initialize empty database
|
||||||
let initdb_path = pg_bin_dir.join("initdb");
|
let initdb_path = pg_bin_dir.join("initdb");
|
||||||
let mut child = Command::new(&initdb_path)
|
let mut child = Command::new(&initdb_path)
|
||||||
|
.envs(vec![
|
||||||
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
])
|
||||||
.args(["-D", pg_data_path.as_ref()])
|
.args(["-D", pg_data_path.as_ref()])
|
||||||
.spawn()
|
.spawn()
|
||||||
.expect("Failed to spawn initdb");
|
.expect("Failed to spawn initdb");
|
||||||
@@ -269,7 +283,10 @@ impl StorageController {
|
|||||||
&self.env.base_data_dir,
|
&self.env.base_data_dir,
|
||||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||||
db_start_args,
|
db_start_args,
|
||||||
[],
|
vec![
|
||||||
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
],
|
||||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||||
retry_timeout,
|
retry_timeout,
|
||||||
|| self.pg_isready(&pg_bin_dir),
|
|| self.pg_isready(&pg_bin_dir),
|
||||||
@@ -324,7 +341,10 @@ impl StorageController {
|
|||||||
&self.env.base_data_dir,
|
&self.env.base_data_dir,
|
||||||
&self.env.storage_controller_bin(),
|
&self.env.storage_controller_bin(),
|
||||||
args,
|
args,
|
||||||
[],
|
vec![
|
||||||
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
],
|
||||||
background_process::InitialPidFile::Create(self.pid_file()),
|
background_process::InitialPidFile::Create(self.pid_file()),
|
||||||
retry_timeout,
|
retry_timeout,
|
||||||
|| async {
|
|| async {
|
||||||
|
|||||||
@@ -56,6 +56,10 @@ enum Command {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
scheduling: Option<NodeSchedulingPolicy>,
|
scheduling: Option<NodeSchedulingPolicy>,
|
||||||
},
|
},
|
||||||
|
NodeDelete {
|
||||||
|
#[arg(long)]
|
||||||
|
node_id: NodeId,
|
||||||
|
},
|
||||||
/// Modify a tenant's policies in the storage controller
|
/// Modify a tenant's policies in the storage controller
|
||||||
TenantPolicy {
|
TenantPolicy {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
@@ -337,7 +341,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
Command::TenantCreate { tenant_id } => {
|
Command::TenantCreate { tenant_id } => {
|
||||||
storcon_client
|
storcon_client
|
||||||
.dispatch(
|
.dispatch::<_, ()>(
|
||||||
Method::POST,
|
Method::POST,
|
||||||
"v1/tenant".to_string(),
|
"v1/tenant".to_string(),
|
||||||
Some(TenantCreateRequest {
|
Some(TenantCreateRequest {
|
||||||
@@ -357,13 +361,16 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
tracing::info!("Delete status: {}", status);
|
tracing::info!("Delete status: {}", status);
|
||||||
}
|
}
|
||||||
Command::Nodes {} => {
|
Command::Nodes {} => {
|
||||||
let resp = storcon_client
|
let mut resp = storcon_client
|
||||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||||
Method::GET,
|
Method::GET,
|
||||||
"control/v1/node".to_string(),
|
"control/v1/node".to_string(),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||||
|
|
||||||
let mut table = comfy_table::Table::new();
|
let mut table = comfy_table::Table::new();
|
||||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||||
for node in resp {
|
for node in resp {
|
||||||
@@ -395,13 +402,16 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
Command::Tenants {} => {
|
Command::Tenants {} => {
|
||||||
let resp = storcon_client
|
let mut resp = storcon_client
|
||||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||||
Method::GET,
|
Method::GET,
|
||||||
"control/v1/tenant".to_string(),
|
"control/v1/tenant".to_string(),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
|
||||||
|
|
||||||
let mut table = comfy_table::Table::new();
|
let mut table = comfy_table::Table::new();
|
||||||
table.set_header([
|
table.set_header([
|
||||||
"TenantId",
|
"TenantId",
|
||||||
@@ -650,6 +660,11 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
Command::NodeDelete { node_id } => {
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
Command::TenantSetTimeBasedEviction {
|
Command::TenantSetTimeBasedEviction {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
period,
|
period,
|
||||||
|
|||||||
docs/rfcs/033-storage-controller-drain-and-fill.md (new file, 345 lines)
@@ -0,0 +1,345 @@
|
# Graceful Restarts of Storage Controller Managed Clusters
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes.
|
||||||
|
It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement
|
||||||
|
graceful cluster restarts.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Pageserver restarts cause read availability downtime for tenants.
|
||||||
|
|
||||||
|
For example pageserver-3 @ us-east-1 was unavailable for a randomly
|
||||||
|
picked tenant (which requested on-demand activation) for around 30 seconds
|
||||||
|
during the restart at 2024-04-03 16:37 UTC.
|
||||||
|
|
||||||
|
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||||
|
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||||
|
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||||
|
|
||||||
|
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||||
|
tenant density is much lower there. However, we are planning on eventually migrating all
|
||||||
|
pageservers to storage controller management, so it makes sense to solve the issue proactively.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Pageserver re-deployments cause minimal downtime for tenants
|
||||||
|
- The storage controller exposes HTTP API hooks for draining and filling tenant shards
|
||||||
|
from a given pageserver. Said hooks can be used by an orchestrator process or a human operator.
|
||||||
|
- The storage controller exposes some HTTP API to cancel draining and filling background operations.
|
||||||
|
- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed
|
||||||
|
as usual (with downtime).
|
||||||
|
- Progress of draining/filling is visible through metrics
|
||||||
|
|
||||||
|
## Non Goals
|
||||||
|
|
||||||
|
- Integration with the control plane
|
||||||
|
- Graceful restarts for large non-HA tenants.
|
||||||
|
|
||||||
|
## Impacted Components
|
||||||
|
|
||||||
|
- storage controller
|
||||||
|
- deployment orchestrator (i.e. Ansible)
|
||||||
|
- pageserver (indirectly)
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
**Draining** is the process through which all tenant shards that can be migrated from a given pageserver
|
||||||
|
are distributed across the rest of the cluster.
|
||||||
|
|
||||||
|
**Filling** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given
|
||||||
|
pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers.
|
||||||
|
|
||||||
|
**Node scheduling policies** act as constraints to the scheduler. For instance, when a
|
||||||
|
node is set in the `Paused` policy, no further shards will be scheduled on it.
|
||||||
|
|
||||||
|
**Node** is a pageserver. The two terms are used interchangeably in this RFC.
|
||||||
|
|
||||||
|
**Deployment orchestrator** is a generic term for whatever drives our deployments.
|
||||||
|
Currently, it's an Ansible playbook.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
### Storage Controller Basics (skip if already familiar)
|
||||||
|
|
||||||
|
Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.
|
||||||
|
|
||||||
|
An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook.
|
||||||
|
|
||||||
|
### Background Optimizations
|
||||||
|
|
||||||
|
The storage controller performs scheduling optimizations in the background. It will
|
||||||
|
migrate attachments to warm secondaries and replace secondaries in order to balance
|
||||||
|
the cluster out.
|
||||||
|
|
||||||
|
### Reconciliations Concurrency Limiting
|
||||||
|
|
||||||
|
There's a hard limit on the number of reconciles that the storage controller
|
||||||
|
can have in flight at any given time. To get an idea of scales, the limit is
|
||||||
|
128 at the time of writing.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
Note: this section focuses on the core functionality of the graceful restart process.
|
||||||
|
It doesn't necessarily describe the most efficient approach. Optimizations are described
|
||||||
|
separately in a later section.
|
||||||
|
|
||||||
|
### Overall Flow
|
||||||
|
|
||||||
|
This section describes how to implement graceful restarts from the perspective
|
||||||
|
of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
|
||||||
|
The orchestrator shall implement the following epilogue and prologue steps for each
|
||||||
|
pageserver restart:
|
||||||
|
|
||||||
|
#### Prologue
|
||||||
|
|
||||||
|
The orchestrator shall first fetch the pageserver node id from the control plane or
|
||||||
|
the pageserver it aims to restart directly. Next, it issues an HTTP request
|
||||||
|
to the storage controller in order to start the drain of said pageserver node.
|
||||||
|
All error responses are retried with a short back-off. When a 202 (Accepted)
|
||||||
|
HTTP code is returned, the drain has started. Now the orchestrator polls the
|
||||||
|
node status endpoint exposed by the storage controller in order to await the
|
||||||
|
end of the drain process. When the `policy` field of the node status response
|
||||||
|
becomes `PauseForRestart`, the drain has completed and the orchestrator can
|
||||||
|
proceed with restarting the pageserver.
|
||||||
|
|
||||||
|
The prologue is subject to an overall timeout. It will have a value in the ballpark
|
||||||
|
of minutes. As storage controller managed pageservers become more loaded this timeout
|
||||||
|
will likely have to increase.
|
||||||
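To make the prologue concrete, here is a minimal sketch (not a finalized client) of the drain-then-poll sequence in Rust. It assumes `tokio`, `anyhow`, `serde_json`, and `reqwest` with its `json` feature; the drain endpoint is the one proposed under "Draining/Filling APIs" below, while the node status path and the shape of its `policy` field are assumptions rather than a settled API.

```rust
use std::time::Duration;

use anyhow::ensure;
use reqwest::{Client, StatusCode};
use serde_json::Value;

/// Drain a node and wait until its scheduling policy reaches `PauseForRestart`,
/// at which point the pageserver can be restarted.
async fn drain_and_wait(client: &Client, base_url: &str, node_id: u64) -> anyhow::Result<()> {
    // Start the drain; a production orchestrator would retry errors with a short back-off.
    let resp = client
        .put(format!("{base_url}/v1/control/node/{node_id}/drain"))
        .send()
        .await?;
    ensure!(resp.status() == StatusCode::ACCEPTED, "drain not accepted: {}", resp.status());

    // Poll the (assumed) node status endpoint until the drain completes.
    loop {
        let status: Value = client
            .get(format!("{base_url}/v1/control/node/{node_id}"))
            .send()
            .await?
            .json()
            .await?;
        if status["policy"] == "PauseForRestart" {
            return Ok(());
        }
        tokio::time::sleep(Duration::from_secs(5)).await;
    }
}
```

The epilogue would be symmetric: `PUT .../fill` after the restart, then poll until the policy returns to `Active`, all bounded by the overall timeout described above (omitted in the sketch).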
|
|
||||||
|
#### Epilogue
|
||||||
|
|
||||||
|
After restarting the pageserver, the orchestrator issues an HTTP request
|
||||||
|
to the storage controller to kick off the filling process. This API call
|
||||||
|
may be retried for all error codes with a short backoff. This also serves
|
||||||
|
as a synchronization primitive as the fill will be refused if the pageserver
|
||||||
|
has not yet re-attached to the storage controller. When a 202(Accepted) HTTP
|
||||||
|
code is returned, the fill has started. Now the orchestrator polls the node
|
||||||
|
status endpoint exposed by the storage controller in order to await the end of
|
||||||
|
the filling process. When the `policy` field of the node status response becomes
|
||||||
|
`Active`, the fill has completed and the orchestrator may proceed to the next pageserver.
|
||||||
|
|
||||||
|
Again, the epilogue is subject to an overall timeout. We can start off with
|
||||||
|
using the same timeout as for the prologue, but can also consider relying on
|
||||||
|
the storage controller's background optimizations with a shorter timeout.
|
||||||
|
|
||||||
|
In the case that the deployment orchestrator times out, it attempts to cancel
|
||||||
|
the fill. This operation shall be retried with a short back-off. If it ultimately
|
||||||
|
fails it will require manual intervention to set the nodes scheduling policy to
|
||||||
|
`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic,
|
||||||
|
but it constrains the scheduler as mentioned previously.
|
||||||
|
|
||||||
|
### Node Scheduling Policy State Machine
|
||||||
|
|
||||||
|
The state machine below encodes the behaviours discussed above and
|
||||||
|
the various failover situations described in a later section.
|
||||||
|
|
||||||
|
Assuming no failures and/or timeouts the flow should be:
|
||||||
|
`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active`
|
||||||
|
|
||||||
|
```
|
||||||
|
Operator requested drain
|
||||||
|
+-----------------------------------------+
|
||||||
|
| |
|
||||||
|
+-------+-------+ +-------v-------+
|
||||||
|
| | | |
|
||||||
|
| Pause | +-----------> Draining +----------+
|
||||||
|
| | | | | |
|
||||||
|
+---------------+ | +-------+-------+ |
|
||||||
|
| | |
|
||||||
|
| | |
|
||||||
|
Drain requested| | |
|
||||||
|
| |Drain complete | Drain failed
|
||||||
|
| | | Cancelled/PS reattach/Storcon restart
|
||||||
|
| | |
|
||||||
|
+-------+-------+ | |
|
||||||
|
| | | |
|
||||||
|
+-------------+ Active <-----------+------------------+
|
||||||
|
| | | |
|
||||||
|
Fill requested | +---^---^-------+ |
|
||||||
|
| | | |
|
||||||
|
| | | |
|
||||||
|
| | | |
|
||||||
|
| Fill completed| | |
|
||||||
|
| | |PS reattach |
|
||||||
|
| | |after restart |
|
||||||
|
+-------v-------+ | | +-------v-------+
|
||||||
|
| | | | | |
|
||||||
|
| Filling +---------+ +-----------+PauseForRestart|
|
||||||
|
| | | |
|
||||||
|
+---------------+ +---------------+
|
||||||
|
```
|
||||||
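Read as code, the happy path of the diagram is a small state transition function. The sketch below is illustrative only; the real `NodeSchedulingPolicy` enum, its persistence, and the transition logic live in the storage controller, and the string events are placeholders for the actual triggers.

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum NodeSchedulingPolicy {
    Active,
    Pause,
    Draining,
    PauseForRestart,
    Filling,
}

/// Happy-path transitions per the diagram above; failures, cancellations,
/// pageserver re-attaches, and storcon restarts fall back to `Active`.
fn next(policy: NodeSchedulingPolicy, event: &str) -> NodeSchedulingPolicy {
    use NodeSchedulingPolicy::*;
    match (policy, event) {
        (Active | Pause, "drain requested") => Draining,
        (Draining, "drain complete") => PauseForRestart,
        (PauseForRestart, "ps reattach after restart") => Active,
        (Active, "fill requested") => Filling,
        (Filling, "fill complete") => Active,
        // Drain/fill failed, cancelled, or interrupted.
        (Draining | Filling, _) => Active,
        (p, _) => p,
    }
}
```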
|
|
||||||
|
### Draining/Filling APIs
|
||||||
|
|
||||||
|
The storage controller API to trigger the draining of a given node is:
|
||||||
|
`PUT /v1/control/node/:node_id/{drain,fill}`.
|
||||||
|
|
||||||
|
The following HTTP non-success return codes are used.
|
||||||
|
All of them are safely retriable from the perspective of the storage controller.
|
||||||
|
- 404: Requested node was not found
|
||||||
|
- 503: Requested node is known to the storage controller, but unavailable
|
||||||
|
- 412: Drain precondition failed: there is no other node to drain to or the node's scheduling policy forbids draining
|
||||||
|
- 409: A {drain, fill} is already in progress. Only one such background operation
|
||||||
|
is allowed per node.
|
||||||
|
|
||||||
|
When the drain is accepted and commenced a 202 HTTP code is returned.
|
||||||
|
|
||||||
|
Drains and fills shall be cancellable by the deployment orchestrator or a
|
||||||
|
human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200
|
||||||
|
response is returned when the cancellation is successful. Errors are retriable.
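A sketch of how a deployment orchestrator might drive these endpoints, assuming `reqwest` and `tokio`; the retry policy is deliberately naive, and a real client would bound its retries and log response bodies:

```rust
use std::time::Duration;

use reqwest::StatusCode;

/// Ask the storage controller to start draining `node_id`, retrying the
/// retriable error codes listed above, until a 202 has been observed.
async fn request_drain(
    client: &reqwest::Client,
    storcon_url: &str,
    node_id: u64,
) -> anyhow::Result<()> {
    loop {
        let resp = client
            .put(format!("{storcon_url}/v1/control/node/{node_id}/drain"))
            .send()
            .await?;
        match resp.status() {
            StatusCode::ACCEPTED => return Ok(()),
            // 404 / 503 / 412 / 409 are all safely retriable per this RFC;
            // a production client would bound these retries.
            StatusCode::NOT_FOUND
            | StatusCode::SERVICE_UNAVAILABLE
            | StatusCode::PRECONDITION_FAILED
            | StatusCode::CONFLICT => {
                tokio::time::sleep(Duration::from_secs(2)).await;
            }
            other => anyhow::bail!("unexpected status {other}"),
        }
    }
}

/// Cancel an in-flight drain, e.g. when the orchestrator's timeout expires.
async fn cancel_drain(
    client: &reqwest::Client,
    storcon_url: &str,
    node_id: u64,
) -> anyhow::Result<()> {
    client
        .delete(format!("{storcon_url}/v1/control/node/{node_id}/drain"))
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}
```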
|
||||||
|
|
||||||
|
### Drain Process
|
||||||
|
|
||||||
|
Before accepting a drain request, the following validations are applied:
|
||||||
|
* Ensure that the node is known to the storage controller
|
||||||
|
* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
|
||||||
|
* Ensure that another drain or fill is not already running on the node
|
||||||
|
* Ensure that a drain is possible (i.e. check that there is at least one
|
||||||
|
schedulable node to drain to)
|
||||||
|
|
||||||
|
After accepting the drain, the scheduling policy of the node is set to
|
||||||
|
`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
|
||||||
|
This disallows the optimizer from adding or removing shards from the node which
|
||||||
|
is desirable to avoid them racing.
|
||||||
|
|
||||||
|
Next, a separate Tokio task is spawned to manage the draining. For each tenant
|
||||||
|
shard attached to the node being drained, demote the attached location on that node to a secondary and
attempt to schedule the shard away. Scheduling might fail due to unsatisfiable
|
||||||
|
constraints, but that is fine. Draining is a best effort process since it might
|
||||||
|
not always be possible to cut over all shards.
|
||||||
|
|
||||||
|
Importantly, this task manages the concurrency of issued reconciles in order to
|
||||||
|
avoid drowning out the target pageservers and to allow other important reconciles
|
||||||
|
to proceed.
|
||||||
|
|
||||||
|
Once the triggered reconciles have finished or timed out, set the node's scheduling
|
||||||
|
policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
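A condensed sketch of that drain task; `demote_and_schedule_away`, `set_node_scheduling_policy` and the shard id type are hypothetical stand-ins for the storage controller's real machinery, and the semaphore models the reconcile concurrency limit:

```rust
use std::sync::Arc;

use tokio::sync::Semaphore;

// Illustrative stand-in for the real tenant shard id type.
type TenantShardId = u128;

async fn drain_node(
    node_id: u64,
    attached_shards: Vec<TenantShardId>,
    max_concurrent_reconciles: usize,
) {
    // Bound how many reconciles the drain may have in flight at once, so that
    // target pageservers are not drowned out and other reconciles can proceed.
    let limiter = Arc::new(Semaphore::new(max_concurrent_reconciles));
    let mut tasks = tokio::task::JoinSet::new();

    for shard in attached_shards {
        let permit = limiter.clone().acquire_owned().await.expect("not closed");
        tasks.spawn(async move {
            let _permit = permit;
            // Demote the draining node to a secondary for this shard and ask
            // the scheduler to place the attachment elsewhere. Scheduling may
            // fail due to unsatisfiable constraints; draining is best effort,
            // so we just log and move on.
            if let Err(e) = demote_and_schedule_away(shard, node_id).await {
                tracing::warn!("failed to drain shard {}: {:#}", shard, e);
            }
        });
    }
    while tasks.join_next().await.is_some() {}

    // Signal the end of the drain to the orchestrator.
    set_node_scheduling_policy(node_id, "PauseForRestart").await;
}

// Hypothetical helpers; the real logic lives in the storage controller.
async fn demote_and_schedule_away(_shard: TenantShardId, _node: u64) -> anyhow::Result<()> { Ok(()) }
async fn set_node_scheduling_policy(_node: u64, _policy: &str) {}
```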
|
||||||
|
|
||||||
|
A note on non HA tenants: These tenants do not have secondaries, so by the description
|
||||||
|
above, they would not be migrated. It makes sense to skip them (especially the large ones)
|
||||||
|
since, depending on tenant size, this might be more disruptive than the restart itself: the
pageserver we've moved to will need to on-demand download the entire working set for the tenant.
|
||||||
|
We can consider expanding to small non-HA tenants in the future.
|
||||||
|
|
||||||
|
### Fill Process
|
||||||
|
|
||||||
|
Before accepting a fill request, the following validations are applied:
|
||||||
|
* Ensure that the node is known to the storage controller
|
||||||
|
* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`.
|
||||||
|
This is the only acceptable policy for the fill starting state. When a node re-attaches,
|
||||||
|
it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
|
||||||
|
`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
|
||||||
|
* Ensure that another drain or fill is not already running on the node
|
||||||
|
|
||||||
|
After accepting the fill, the scheduling policy of the node is set to
|
||||||
|
`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
|
||||||
|
This disallows the optimizer from adding or removing shards from the node which
|
||||||
|
is desirable to avoid them racing.
|
||||||
|
|
||||||
|
Next, a separate Tokio task is spawned to manage the filling. For each tenant
|
||||||
|
shard where the filled node is a secondary, promote the secondary. This is done
|
||||||
|
until we run out of shards or the counts of attached shards become balanced across
|
||||||
|
the cluster.
|
||||||
|
|
||||||
|
Like for draining, the concurrency of spawned reconciles is limited.
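For symmetry, a sketch of the fill task with the same caveats: the helpers are hypothetical, the concurrency limit is elided, and the balance target would in practice be computed from per-node attachment counts:

```rust
// Illustrative stand-in for the real tenant shard id type.
type TenantShardId = u128;

struct FillPlan {
    /// Shards for which the filled node currently holds a secondary location.
    candidates: Vec<TenantShardId>,
    /// How many promotions are needed before attached-shard counts are
    /// balanced across the cluster.
    promotions_wanted: usize,
}

async fn fill_node(node_id: u64, plan: FillPlan) {
    let mut promoted = 0;
    for shard in plan.candidates {
        if promoted >= plan.promotions_wanted {
            break; // counts are balanced; stop early
        }
        // Promote the secondary on the filled node to the attached location.
        // As with draining, these reconciles run under a concurrency limit
        // (elided here) so other work is not starved.
        match promote_secondary(shard, node_id).await {
            Ok(()) => promoted += 1,
            Err(e) => tracing::warn!("failed to promote {} onto node {}: {:#}", shard, node_id, e),
        }
    }
    // Return the node to the neutral state once the fill has quiesced.
    set_node_scheduling_policy(node_id, "Active").await;
}

// Hypothetical helpers; the real logic lives in the storage controller.
async fn promote_secondary(_shard: TenantShardId, _node: u64) -> anyhow::Result<()> { Ok(()) }
async fn set_node_scheduling_policy(_node: u64, _policy: &str) {}
```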
|
||||||
|
|
||||||
|
### Failure Modes & Handling
|
||||||
|
|
||||||
|
Failures are generally handled by transition back into the `Active`
|
||||||
|
(neutral) state. This simplifies the implementation greatly at the
|
||||||
|
cost of adding transitions to the state machine. For example, we
|
||||||
|
could detect the `Draining` state upon restart and proceed with a drain,
|
||||||
|
but how would the storage controller know that this is still what the
orchestrator needs?
|
||||||
|
|
||||||
|
#### Storage Controller Crash
|
||||||
|
|
||||||
|
When the storage controller starts up, it resets the node scheduling policy
|
||||||
|
of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
|
||||||
|
`Active`. The rationale is that when the storage controller restarts,
|
||||||
|
we have lost context of what the deployment orchestrator wants. It also
|
||||||
|
has the benefit of making things easier to reason about.
|
||||||
|
|
||||||
|
#### Pageserver Crash During Drain
|
||||||
|
|
||||||
|
The pageserver will attempt to re-attach during restart at which
|
||||||
|
point the node scheduling policy will be set back to `Active`, thus
|
||||||
|
reenabling the scheduler to use the node.
|
||||||
|
|
||||||
|
#### Non-drained Pageserver Crash During Drain
|
||||||
|
|
||||||
|
What should happen when a pageserver we are draining to crashes during the
process? Two reasonable options are: cancel the drain and focus on the failover,
*or* do both, but prioritise the failover. Since the number of concurrent reconciles
produced by drains/fills is limited, we get the latter behaviour for free.
|
||||||
|
My suggestion is we take this approach, but the cancellation option is trivial
|
||||||
|
to implement as well.
|
||||||
|
|
||||||
|
#### Pageserver Crash During Fill
|
||||||
|
|
||||||
|
The pageserver will attempt to re-attach during restart at which
|
||||||
|
point the node scheduling policy will be set back to `Active`, thus
|
||||||
|
reenabling the scheduler to use the node.
|
||||||
|
|
||||||
|
#### Pageserver Goes Unavailable During Drain/Fill
|
||||||
|
|
||||||
|
The drain and fill jobs handle this by stopping early. When the pageserver
|
||||||
|
is detected as online by storage controller heartbeats, reset its scheduling
|
||||||
|
policy to `Active`. If a restart happens instead, see the pageserver crash
|
||||||
|
failure mode.
|
||||||
|
|
||||||
|
#### Orchestrator Drain Times Out
|
||||||
|
|
||||||
|
The orchestrator will still proceed with the restart.
|
||||||
|
When the pageserver re-attaches, the scheduling policy is set back to
|
||||||
|
`Active`.
|
||||||
|
|
||||||
|
#### Orchestrator Fill Times Out
|
||||||
|
|
||||||
|
The orchestrator will attempt to cancel the fill operation. If that fails,
|
||||||
|
the fill will continue until it quiesces and the node will be left
|
||||||
|
in the `Filling` scheduling policy. This hinders the scheduler, but is
|
||||||
|
otherwise harmless. A human operator can handle this by setting the scheduling
|
||||||
|
policy to `Active`, or we can bake a fill timeout into the storage controller.
|
||||||
|
|
||||||
|
## Optimizations
|
||||||
|
|
||||||
|
### Location Warmth
|
||||||
|
|
||||||
|
When cutting over to a secondary, the storage controller will wait for it to
|
||||||
|
become "warm" (i.e. download enough of the tenants data). This means that some
|
||||||
|
reconciliations can take significantly longer than others and hold up precious
|
||||||
|
reconciliation units. As an optimization, the drain stage can only cut over
|
||||||
|
tenants that are already "warm". Similarly, the fill stage can prioritise the
|
||||||
|
"warmest" tenants in the fill.
|
||||||
|
|
||||||
|
Given that the number of tenants managed by the storage controller will be fairly low
for the foreseeable future, the first implementation could simply query the tenants
|
||||||
|
for secondary status. This doesn't scale well with increasing tenant counts, so
|
||||||
|
eventually we will need new pageserver API endpoints to report the sets of
|
||||||
|
"warm" and "cold" nodes.
|
||||||
|
|
||||||
|
## Alternatives Considered
|
||||||
|
|
||||||
|
### Draining and Filling Purely as Scheduling Constraints
|
||||||
|
|
||||||
|
At its core, the storage controller is a big background loop that detects changes
|
||||||
|
in the environment and reacts on them. One could express draining and filling
|
||||||
|
of nodes purely in terms of constraining the scheduler (as opposed to having
|
||||||
|
such background tasks).
|
||||||
|
|
||||||
|
While theoretically nice, I think that's harder to implement and, more importantly, to operate and reason about.
|
||||||
|
Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create
|
||||||
|
an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish
|
||||||
|
to cancel the reconciliation tasks spawned by node drains/fills. How would we know which ones belong
|
||||||
|
to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion.
|
||||||
|
|
||||||
|
It would also mean that reconciliations themselves have side effects that persist in the database
|
||||||
|
(persist something to the database when the drain is done), which I'm not conceptually fond of.
|
||||||
|
|
||||||
|
## Proof of Concept
|
||||||
|
|
||||||
|
This RFC is accompanied by a POC which implements nearly everything mentioned here
|
||||||
|
apart from the optimizations and some of the failure handling:
|
||||||
|
https://github.com/neondatabase/neon/pull/7682
|
||||||
507
docs/rfcs/034-timeline-archive.md
Normal file
@@ -0,0 +1,507 @@
|
|||||||
|
# Timeline Archival
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This RFC describes a mechanism for pageservers to eliminate local storage + compute work
|
||||||
|
for timelines which are not in use, in response to external API calls to "archive" a timeline.
|
||||||
|
|
||||||
|
The archived state roughly corresponds to fully offloading a timeline to object storage, such
|
||||||
|
that its cost is purely the cost of that object storage.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Archived timelines serve multiple purposes:
|
||||||
|
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
|
||||||
|
database from longer ago than their PITR window.
|
||||||
|
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
|
||||||
|
to diligently clean them up later to avoid overloading the pageserver (currently we support
|
||||||
|
up to ~500 branches per tenant).
|
||||||
|
|
||||||
|
### Prior art
|
||||||
|
|
||||||
|
Most storage and database systems have some form of snapshot, which can be implemented several ways:
|
||||||
|
1. full copies of data (e.g. an EBS snapshot to S3)
|
||||||
|
2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
|
||||||
|
3. a series of snapshots which are CoW or de-duplicated relative to one another.
|
||||||
|
|
||||||
|
Today's Neon branches are approximately like `2.`, although due to implementation details branches
|
||||||
|
often end up storing much more data than they really need, as parent branches assume that all data
|
||||||
|
at the branch point is needed. The layers pinned in the parent branch may have a much larger size
|
||||||
|
than the physical size of a compressed image layer representing the data at the branch point.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Enter & exit the archived state in response to external admin API calls
|
||||||
|
- API calls to modify the archived state are atomic and durable
|
||||||
|
- An archived timeline should eventually (once out of PITR window) use an efficient compressed
|
||||||
|
representation, and avoid retaining arbitrarily large data in its parent branch.
|
||||||
|
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
|
||||||
|
but must not scale with the number of _archived_ branches.
|
||||||
|
- Background I/O for archived branches should only be done a limited number of times to evolve them
|
||||||
|
to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping"
|
||||||
|
overhead for archived branches, including operations related to calculating sizes for billing.
|
||||||
|
- The pageserver should put no load on the safekeeper for archived branches.
|
||||||
|
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
|
||||||
|
to a performant state in a short time (linear with the branch's logical size)
|
||||||
|
|
||||||
|
## Non Goals
|
||||||
|
|
||||||
|
- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
|
||||||
|
in Neon's internal format.
|
||||||
|
- Compute cold starts after activating an archived branch will not have comparable performance to
|
||||||
|
cold starts on an active branch.
|
||||||
|
- Archived branches will not use any new/additional compression or de-duplication beyond what
|
||||||
|
is already implemented for image layers (zstd per page).
|
||||||
|
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
|
||||||
|
are only activated explicitly via the HTTP API.
|
||||||
|
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
|
||||||
|
remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
|
||||||
|
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
|
||||||
|
detailed HTTP APIs other than the specific API for listing archived timelines.
|
||||||
|
- A parent branch may not be archived unless all its children are.
|
||||||
|
|
||||||
|
## Impacted Components
|
||||||
|
|
||||||
|
pageserver, storage controller
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
|
||||||
|
may assume that this branch is now very cheap to store, although this may not be physically so until the
|
||||||
|
branch proceeds to the offloaded state.
|
||||||
|
|
||||||
|
**Active** branches are branches which are available for use by page_service clients, and have a relatively
|
||||||
|
high cost due to consuming local storage.
|
||||||
|
|
||||||
|
**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
|
||||||
|
that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
|
||||||
|
|
||||||
|
**Activate** (verb): transition from Archived to Active
|
||||||
|
|
||||||
|
**Archive** (verb): transition from Active to Archived
|
||||||
|
|
||||||
|
**Offload** (verb): transition from Archived to Offloaded
|
||||||
|
|
||||||
|
**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
|
||||||
|
|
||||||
|
**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is
|
||||||
|
warmed up, good performance will be available to page_service clients.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### High level flow
|
||||||
|
|
||||||
|
We may think of a timeline which is archived and then activated as proceeding through a series of states:
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
stateDiagram
|
||||||
|
[*] --> Active(warm)
|
||||||
|
Active(warm) --> Archived
|
||||||
|
Archived --> Offloaded
|
||||||
|
Archived --> Active(warm)
|
||||||
|
Offloaded --> Active(cold)
|
||||||
|
Active(cold) --> Active(warm)
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
|
||||||
|
of branches will be:
|
||||||
|
- Very frequent: Short lived branches: Active -> Deleted
|
||||||
|
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
|
||||||
|
- Rare: Branches used to restore old state: Active -> Archived -> Offloaded -> Active
|
||||||
|
|
||||||
|
These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
|
||||||
|
of:
|
||||||
|
- the timeline's lifecycle state: active or archived, stored in the timeline's index
|
||||||
|
- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
|
||||||
|
manifest of offloaded timelines.
|
||||||
|
- cache state (whether it's warm or cold).
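As a sketch, the externally visible state in the diagram can be derived from those three ingredients roughly as follows; the names are illustrative, not the pageserver's actual types:

```rust
/// Lifecycle state stored in the timeline's index_part.json.
enum Lifecycle {
    Active,
    Archived,
}

/// The composite, externally visible state used in the diagram above.
enum VisibleState {
    ActiveWarm,
    ActiveCold,
    Archived,
    Offloaded,
}

fn visible_state(lifecycle: Lifecycle, offloaded: bool, warm: bool) -> VisibleState {
    match (lifecycle, offloaded) {
        // An offloaded timeline has no local state at all.
        (Lifecycle::Archived, true) => VisibleState::Offloaded,
        (Lifecycle::Archived, false) => VisibleState::Archived,
        // An active timeline is never offloaded; warmth is just cache state.
        (Lifecycle::Active, _) if warm => VisibleState::ActiveWarm,
        (Lifecycle::Active, _) => VisibleState::ActiveCold,
    }
}
```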
|
||||||
|
|
||||||
|
### Storage format changes
|
||||||
|
|
||||||
|
There are two storage format changes:
|
||||||
|
1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
|
||||||
|
be considered active or archived.
|
||||||
|
2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
|
||||||
|
at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
|
||||||
|
|
||||||
|
The manifest object will have a format like this:
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"offload_timelines": [
|
||||||
|
{
|
||||||
|
"timeline_id": ...
|
||||||
|
"last_record_lsn": ...
|
||||||
|
"last_record_lsn_time": ...
|
||||||
|
"pitr_interval": ...
|
||||||
|
"last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
|
||||||
|
"logical_size": ... # The size at last_record_lsn
|
||||||
|
"physical_size" ...
|
||||||
|
"parent": Option<{
|
||||||
|
"timeline_id"...
|
||||||
|
"lsn"... # Branch point LSN on the parent
|
||||||
|
"requires_data": bool # True if this branch depends on layers in its parent, identify it here
|
||||||
|
|
||||||
|
}>
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
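A sketch of how the manifest might be modelled with `serde` (assuming chrono's `serde` feature for the timestamp); field names follow the JSON above, while the id, LSN and duration types are simplified stand-ins:

```rust
use serde::{Deserialize, Serialize};

// Stand-in types: the real code would use the pageserver's Lsn and TimelineId.
type TimelineId = String;
type Lsn = u64;

#[derive(Serialize, Deserialize)]
struct TenantManifest {
    offload_timelines: Vec<OffloadedTimeline>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedTimeline {
    timeline_id: TimelineId,
    last_record_lsn: Lsn,
    /// Wall-clock time of last_record_lsn, used for the PITR check below.
    last_record_lsn_time: chrono::DateTime<chrono::Utc>,
    /// Serialized representation left to the implementation.
    pitr_interval: std::time::Duration,
    /// Equal to last_record_lsn if this branch has no history (i.e. a snapshot).
    last_gc_lsn: Lsn,
    /// The size at last_record_lsn, reported in consumption metrics.
    logical_size: u64,
    physical_size: u64,
    parent: Option<OffloadedParent>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedParent {
    timeline_id: TimelineId,
    /// Branch point LSN on the parent.
    lsn: Lsn,
    /// True if this branch still depends on layers in its parent.
    requires_data: bool,
}
```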
|
||||||
|
|
||||||
|
The information about a timeline in its offload state is intentionally minimal: just enough to decide:
|
||||||
|
- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
|
||||||
|
by checking if `now > last_record_lsn_time + pitr_interval`, and `last_gc_lsn < last_record_lsn`.
|
||||||
|
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
|
||||||
|
layers that the archived branch depends on
|
||||||
|
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
|
||||||
|
is received for a timeline_id that isn't in the set of live `Timelines` or in the manifest, then
we don't need to go to S3 for the deletion).
|
||||||
|
- How much archived space to report in consumption metrics
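The first of those checks could be expressed over the manifest fields roughly as below; this is a sketch of the inequality only, and the real decision would also consult the heuristics in the Optimizations section:

```rust
use std::time::Duration;

use chrono::{DateTime, Utc};

/// Does this offloaded timeline still have history of its own that could be
/// rewritten into image layers now that its PITR window has elapsed?
fn needs_flattening(
    last_record_lsn: u64,
    last_gc_lsn: u64,
    last_record_lsn_time: DateTime<Utc>,
    pitr_interval: Duration,
    now: DateTime<Utc>,
) -> bool {
    let pitr_elapsed =
        now > last_record_lsn_time + chrono::Duration::from_std(pitr_interval).unwrap();
    let has_history = last_gc_lsn < last_record_lsn;
    pitr_elapsed && has_history
}
```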
|
||||||
|
|
||||||
|
The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
|
||||||
|
set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
|
||||||
|
(offloaded timelines).
|
||||||
|
|
||||||
|
For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
|
||||||
|
index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but
|
||||||
|
give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code
|
||||||
|
for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
|
||||||
|
the manifest file.
|
||||||
|
|
||||||
|
### API & Timeline state
|
||||||
|
|
||||||
|
Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will
|
||||||
|
be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which
|
||||||
|
may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
|
||||||
|
a per-timeline configuration).
|
||||||
|
|
||||||
|
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
|
||||||
|
```
|
||||||
|
{
|
||||||
|
'state': 'active|archive'
|
||||||
|
}
|
||||||
|
```
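A sketch of the corresponding request body types, assuming `serde` and the two values shown above:

```rust
use serde::{Deserialize, Serialize};

/// Body of `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`.
/// Deliberately generic so that further per-timeline attributes can be added later.
#[derive(Serialize, Deserialize)]
struct TimelineConfigureRequest {
    state: TimelineLifecycle,
}

#[derive(Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
enum TimelineLifecycle {
    Active,
    Archive,
}
```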
|
||||||
|
|
||||||
|
When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
|
||||||
|
|
||||||
|
When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
|
||||||
|
**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's
|
||||||
|
index, but not any data: it should be about as fast as a couple of small S3 requests.
|
||||||
|
|
||||||
|
The API will be available with identical path via the storage controller: calling this on a sharded tenant
|
||||||
|
will simply map the API call to all the shards.
|
||||||
|
|
||||||
|
Archived timelines may never have descendant timelines which are active. This will be enforced at the API level,
such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
that all its descendants are archived. It is the caller's responsibility to walk the hierarchy of timelines
|
||||||
|
in the proper order if they would like to archive whole trees of branches.
|
||||||
|
|
||||||
|
Because archived timelines will be excluded from the usual timeline listing APIs, a new API specifically
|
||||||
|
for archived timelines will be added: this is for use in support/debug:
|
||||||
|
|
||||||
|
```
|
||||||
|
GET /v1/tenants/{tenant_id}/archived_timelines
|
||||||
|
|
||||||
|
{
|
||||||
|
...same per-timeline content as the tenant manifest...
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tenant attach changes
|
||||||
|
|
||||||
|
Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
|
||||||
|
we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived
|
||||||
|
timelines, we must have a single object that tells us which timelines do not need to be loaded. The
|
||||||
|
number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
|
||||||
|
because each request covers 1000 timelines.
|
||||||
|
|
||||||
|
This is **not** literally the same as the set of timelines that have state=archived. Rather, it is
|
||||||
|
the set of timelines which have been offloaded in the background after their state was set to archived.
|
||||||
|
|
||||||
|
We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
|
||||||
|
exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need
|
||||||
|
to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
|
||||||
|
to delete an offloaded timeline.
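A sketch of the resulting startup logic, with hypothetical helpers standing in for the remote storage listing and the manifest download (the manifest is simplified to a list of ids here):

```rust
use std::collections::HashSet;

type TimelineId = String;

/// Decide which timelines to actually load during Tenant::spawn. Offloaded
/// timelines are skipped entirely: no Timeline object, no index_part GET.
async fn timelines_to_load(tenant_path: &str) -> anyhow::Result<Vec<TimelineId>> {
    // One (paginated) listing of timeline prefixes: still O(N) ListObjects,
    // but each request covers up to 1000 timelines.
    let all: Vec<TimelineId> = list_timeline_prefixes(tenant_path).await?;

    // One GET for the latest-generation tenant manifest, if it exists.
    let offloaded: HashSet<TimelineId> = load_tenant_manifest(tenant_path)
        .await?
        .map(|m| m.offload_timelines.into_iter().collect())
        .unwrap_or_default();

    Ok(all.into_iter().filter(|t| !offloaded.contains(t)).collect())
}

// Hypothetical helpers standing in for remote storage access.
async fn list_timeline_prefixes(_p: &str) -> anyhow::Result<Vec<TimelineId>> { Ok(vec![]) }
struct Manifest { offload_timelines: Vec<TimelineId> }
async fn load_tenant_manifest(_p: &str) -> anyhow::Result<Option<Manifest>> { Ok(None) }
```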
|
||||||
|
|
||||||
|
### Warm-up API
|
||||||
|
|
||||||
|
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
|
||||||
|
|
||||||
|
This API will be similar to the existing `download_remote_layers` API, but smarter:
|
||||||
|
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
|
||||||
|
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
|
||||||
|
of downloads, so that the caller can poll.
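A sketch of how a caller might drive this endpoint; the progress response type is an assumption, since the RFC only requires "a struct describing progress":

```rust
use std::time::Duration;

use serde::Deserialize;

// Hypothetical progress report returned by the download endpoint.
#[derive(Deserialize)]
struct DownloadProgress {
    layers_total: u64,
    layers_downloaded: u64,
}

async fn warm_up_timeline(
    client: &reqwest::Client,
    base: &str,
    tenant_id: &str,
    timeline_id: &str,
) -> anyhow::Result<()> {
    loop {
        let progress: DownloadProgress = client
            .put(format!(
                "{base}/v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=5000"
            ))
            .send()
            .await?
            .error_for_status()?
            .json()
            .await?;
        if progress.layers_downloaded >= progress.layers_total {
            return Ok(());
        }
        // Brief pause between polls; each call already waits up to wait_ms server-side.
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
}
```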
|
||||||
|
|
||||||
|
The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
|
||||||
|
of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
|
||||||
|
can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache
|
||||||
|
eviction and heatmaps, as well as in this specific case of warming up a timeline.
|
||||||
|
|
||||||
|
The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised
|
||||||
|
to call it, because otherwise populating local contents for a timeline can take a long time when waiting
|
||||||
|
for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
|
||||||
|
volatile.
|
||||||
|
|
||||||
|
### Background work
|
||||||
|
|
||||||
|
Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters
|
||||||
|
an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
|
||||||
|
([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
|
||||||
|
if its state permits that.
|
||||||
|
|
||||||
|
Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
|
||||||
|
optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
|
||||||
|
has elapsed and it can now be rewritten to image layers.
|
||||||
|
|
||||||
|
#### Archive branch offload
|
||||||
|
|
||||||
|
Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
|
||||||
|
any actual work.
|
||||||
|
|
||||||
|
This work is done in the background compaction loop. It makes sense to tack this work onto the compaction
|
||||||
|
loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
|
||||||
|
|
||||||
|
The condition for offload is simple:
|
||||||
|
- a `Timeline` object exists with state `Archived`
|
||||||
|
- the timeline does not have any non-offloaded children.
|
||||||
|
|
||||||
|
Regarding the condition that children must be offloaded, this will always be eventually true, because
|
||||||
|
we enforce at the API level that children of archived timelines must themselves be archived, and all
|
||||||
|
archived timelines will eventually be offloaded.
|
||||||
|
|
||||||
|
Offloading a timeline is simple:
|
||||||
|
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
|
||||||
|
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
|
||||||
|
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
|
||||||
|
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
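In code, the offload step might look roughly like the sketch below; the `Tenant` methods are hypothetical stand-ins, and the real implementation has to hold the appropriate tenant and timeline locks:

```rust
/// Offload one archived timeline: tear down its in-memory state, drop its
/// local files, and record it in the tenant manifest so it is not loaded on
/// the next startup.
async fn offload_timeline(tenant: &mut Tenant, timeline_id: &str) -> anyhow::Result<()> {
    // Capture the attributes we keep in the manifest (especially logical size)
    // before the Timeline object goes away.
    let manifest_entry = tenant.describe_for_offload(timeline_id)?;

    // Shut the timeline down and detach it from the tenant, as deletion would.
    tenant.shutdown_timeline(timeline_id).await?;

    // Remove all of the timeline's content from local storage.
    let local_path = tenant.timeline_local_path(timeline_id);
    tokio::fs::remove_dir_all(&local_path).await?;

    // Persist the new manifest (with generation suffix). If this upload fails
    // we give up and retry on the next iteration of the compaction loop.
    tenant.offloaded.push(manifest_entry);
    tenant.upload_tenant_manifest().await?;
    Ok(())
}

// Hypothetical tenant type with just the pieces used above.
struct Tenant { offloaded: Vec<OffloadEntry> }
struct OffloadEntry;
impl Tenant {
    fn describe_for_offload(&self, _t: &str) -> anyhow::Result<OffloadEntry> { Ok(OffloadEntry) }
    async fn shutdown_timeline(&mut self, _t: &str) -> anyhow::Result<()> { Ok(()) }
    fn timeline_local_path(&self, t: &str) -> std::path::PathBuf { std::path::PathBuf::from(t) }
    async fn upload_tenant_manifest(&self) -> anyhow::Result<()> { Ok(()) }
}
```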
|
||||||
|
|
||||||
|
#### Archive branch optimization (flattening)
|
||||||
|
|
||||||
|
When we offloaded a branch, it might have had some history that prevented rewriting it to a single
|
||||||
|
point in time set of image layers. For example, a branch might have several days of writes and a 7
|
||||||
|
day PITR: when we archive it, it still has those days of history.
|
||||||
|
|
||||||
|
Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
|
||||||
|
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
|
||||||
|
a point in time compared with delta layers
|
||||||
|
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
|
||||||
|
for data, i.e. the ancestor is free to GC layer files at and below the branch point
|
||||||
|
|
||||||
|
Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
|
||||||
|
branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
|
||||||
|
a true snapshot at that LSN.
|
||||||
|
|
||||||
|
It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
|
||||||
|
is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
|
||||||
|
|
||||||
|
Archive branch optimization should be done _before_ background offloads during compaction, because there may
|
||||||
|
be timelines which are ready to be offloaded but also would benefit from the optimization step before
|
||||||
|
being offloaded. For example, a branch which has already fallen out of PITR window and has no history
|
||||||
|
of its own may be immediately re-written as a series of image layers before being offloaded.
|
||||||
|
|
||||||
|
### Consumption metrics
|
||||||
|
|
||||||
|
Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, anticipating
|
||||||
|
that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
|
||||||
|
vs. ordinary content.
|
||||||
|
|
||||||
|
Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
|
||||||
|
variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
|
||||||
|
|
||||||
|
### Secondary locations
|
||||||
|
|
||||||
|
Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby
|
||||||
|
when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
|
||||||
|
will be dropped from secondary locations.
|
||||||
|
|
||||||
|
### Sharding
|
||||||
|
|
||||||
|
Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
|
||||||
|
the same way that timeline creation and deletion is done. There are no special rules about ordering:
|
||||||
|
the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
|
||||||
|
|
||||||
|
Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
|
||||||
|
will be authoritative for consumption metrics.
|
||||||
|
|
||||||
|
## Error cases
|
||||||
|
|
||||||
|
### Errors in sharded tenants
|
||||||
|
|
||||||
|
If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
|
||||||
|
state, where a timeline is archived on some shards but not on others.
|
||||||
|
|
||||||
|
We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
|
||||||
|
are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
|
||||||
|
In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
|
||||||
|
up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent
|
||||||
|
state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't
|
||||||
|
break anything, it's just "weird".
|
||||||
|
|
||||||
|
This is similar to the status quo for timeline creation and deletion: callers are expected to retry
|
||||||
|
these operations until they succeed.
|
||||||
|
|
||||||
|
### Archiving/activating
|
||||||
|
|
||||||
|
Archiving/activating a timeline can fail in a limited number of ways:
|
||||||
|
1. I/O error storing/reading the timeline's updated index
|
||||||
|
- These errors are always retryable: a fundamental design assumption of the pageserver is that remote
|
||||||
|
storage errors are always transient.
|
||||||
|
2. NotFound if the timeline doesn't exist
|
||||||
|
- Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
|
||||||
|
- The storage controller has runtime locking to prevent races such as deleting a timeline while
|
||||||
|
archiving it.
|
||||||
|
3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated
|
||||||
|
- Callers are expected to do their own checks to avoid hitting this case. If they make
|
||||||
|
a mistake and encounter this error, they should give up.
|
||||||
|
|
||||||
|
### Offloading
|
||||||
|
|
||||||
|
Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
|
||||||
|
tenant manifest. In such error cases, we give up in the expectation that offloading will be tried
|
||||||
|
again at the next iteration of the compaction loop.
|
||||||
|
|
||||||
|
### Archive branch optimization
|
||||||
|
|
||||||
|
Optimization is a special form of compaction, so can encounter all the same errors as regular compaction
|
||||||
|
can: it should return Result<(), CompactionError>, and as with compaction it will be retried on
|
||||||
|
the next iteration of the compaction loop.
|
||||||
|
|
||||||
|
## Optimizations
|
||||||
|
|
||||||
|
### Delaying storage optimization if retaining parent layers is cheaper
|
||||||
|
|
||||||
|
Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
|
||||||
|
is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
|
||||||
|
are offloaded to S3 they're totally safe, inert things.
|
||||||
|
|
||||||
|
However, in some cases it can be advantageous to retain extra history on their parent branch rather
|
||||||
|
than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB
|
||||||
|
of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
|
||||||
|
for each nightly branch is inefficient compared with just keeping more history on the main branch.
|
||||||
|
|
||||||
|
Getting this right requires consideration of:
|
||||||
|
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
|
||||||
|
write out extra image layers, then it might make more sense to just write out the image layers on
|
||||||
|
the archived branch.
|
||||||
|
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
|
||||||
|
the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely
|
||||||
|
large layer map can cause problems elsewhere.
|
||||||
|
|
||||||
|
This optimization can probably be implemented quite cheaply with some basic heuristics like:
|
||||||
|
- don't bother doing optimization on an archive branch if the LSN distance between
|
||||||
|
its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
|
||||||
|
- ...but don't keep more history on the main branch than double the PITR window.
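A sketch of such a heuristic, using the illustrative numbers from the list above (5% of logical size, double the PITR window); these are not tuned values:

```rust
/// Decide whether to flatten an archived branch now, or to keep relying on
/// history retained in its parent.
fn should_flatten_now(
    // LSN distance between the branch point and the end of the parent's PITR window.
    lsn_distance_to_pitr_end: u64,
    // Logical size of the archived branch.
    branch_logical_size: u64,
    // How much history the parent is currently retaining, and its PITR setting.
    parent_retained_history: u64,
    parent_pitr_window: u64,
) -> bool {
    // Cheap to keep the parent history instead? Then skip flattening for now...
    let cheap_to_retain =
        lsn_distance_to_pitr_end < branch_logical_size / 20; // < 5% of logical size
    // ...unless the parent would have to retain more than double its PITR window.
    let parent_overloaded = parent_retained_history > 2 * parent_pitr_window;

    !cheap_to_retain || parent_overloaded
}
```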
|
||||||
|
|
||||||
|
### Creating a timeline in archived state (a snapshot)
|
||||||
|
|
||||||
|
Sometimes, one might want to create a branch with no history, which will not be written to
|
||||||
|
before it is archived. This is a snapshot, although we do not require a special snapshot API,
|
||||||
|
since a snapshot can be represented as a timeline with no history.
|
||||||
|
|
||||||
|
This can be accomplished by simply creating a timeline and then immediately archiving it, but
|
||||||
|
that is somewhat wasteful: this timeline will spin up various tasks and open a connection to the storage
broker to try and ingest WAL, before being shut down in the subsequent archival call. To explicitly
|
||||||
|
support this common special case, we may add a parameter to the timeline creation API which
|
||||||
|
creates a timeline directly into the archived state.
|
||||||
|
|
||||||
|
Such a timeline creation will do exactly two I/Os at creation time:
|
||||||
|
- write the index_part object to record the timeline's existence
|
||||||
|
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
|
||||||
|
write the tenant manifest.
|
||||||
|
|
||||||
|
Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
|
||||||
|
up the 'snapshot' branch and write out image layers.
|
||||||
|
|
||||||
|
## Future Work
|
||||||
|
|
||||||
|
### Enabling `fullbackup` dumps from archive branches
|
||||||
|
|
||||||
|
It would be useful to be able to export an archive branch to another system, or for use in a local
|
||||||
|
postgres database.
|
||||||
|
|
||||||
|
This could be implemented as a general capability for all branches, in which case it would "just work"
|
||||||
|
for archive branches by activating them. However, downloading all the layers in a branch just to generate
|
||||||
|
a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
|
||||||
|
which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
|
||||||
|
|
||||||
|
Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
|
||||||
|
is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup
|
||||||
|
stream to S3 in an intermediate format and then having one node stitch them together).
|
||||||
|
|
||||||
|
### Tagging layers from archived branches
|
||||||
|
|
||||||
|
When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
|
||||||
|
we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
|
||||||
|
cheaper storage.
|
||||||
|
|
||||||
|
This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
|
||||||
|
external hints on which branches are likely to be reactivated, and which branches are good candidates for
|
||||||
|
tagging for low performance storage.
|
||||||
|
|
||||||
|
Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object
|
||||||
|
stores have similar mechanisms.
|
||||||
|
|
||||||
|
### Storing sequences of archive branches as deltas
|
||||||
|
|
||||||
|
When archived branches are used as scheduled snapshots, we could store them even more efficiently
|
||||||
|
by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
|
||||||
|
storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
|
||||||
|
pages). This is the kind of encoding that many backup storage systems use.
|
||||||
|
|
||||||
|
The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
|
||||||
|
vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full
|
||||||
|
copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
|
||||||
|
so the complexity tradeoff of diff-encoding it is dubious).
|
||||||
|
|
||||||
|
One does not necessarily have to read back the previous snapshot in order to encode the next one: if the
|
||||||
|
pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
|
||||||
|
we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch,
|
||||||
|
so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
|
||||||
|
delta snapshot".
|
||||||
|
|
||||||
|
Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
|
||||||
|
each other: perhaps this would be done by making the archive branches have child/parent relationships with
|
||||||
|
each other, or perhaps we would permit them to remain children of their original parent, but additionally
|
||||||
|
have a relationship with the snapshot they're encoded relative to.
|
||||||
|
|
||||||
|
Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
|
||||||
|
out how frequently to write a full copy is important. This is essentially a zoomed-out version of what
|
||||||
|
we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
|
||||||
|
|
||||||
|
|
||||||
|
## FAQ/Alternatives
|
||||||
|
|
||||||
|
### Store all timelines in the tenant manifest
|
||||||
|
|
||||||
|
Rather than special-casing offloaded timelines in the offload manifest, we could store a total
|
||||||
|
manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
|
||||||
|
startup.
|
||||||
|
|
||||||
|
That would be a more invasive change (require hooking in to timeline creation), and would
|
||||||
|
generate much more I/O to this manifest for tenants that had many branches _and_ frequent
|
||||||
|
create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines
|
||||||
|
means that we only have to cope with the rate at which long-lived timelines are archived, rather
|
||||||
|
than the rate at which short-lived timelines are created & destroyed.
|
||||||
|
|
||||||
|
### Automatically archiving/activating timelines without external API calls
|
||||||
|
|
||||||
|
We could implement TTL driven offload of timelines, waking them up when a page request
|
||||||
|
arrives.
|
||||||
|
|
||||||
|
This has downsides:
|
||||||
|
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
|
||||||
|
know which of their branches are in this state, and might get a surprise when they try
|
||||||
|
to use such a branch.
|
||||||
|
- Price fluctuation: if the archival of a branch is used in end user pricing, then users
|
||||||
|
prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it
|
||||||
|
is created, rather than having a usage-dependent storage price.
|
||||||
|
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
|
||||||
|
would be awkward, compared with an external entry point.
|
||||||
|
|
||||||
|
### Make offloaded a state of Timeline
|
||||||
|
|
||||||
|
To reduce the operator-facing complexity of having some timeline APIs that only return
|
||||||
|
non-offloaded timelines, we could build the offloaded state into the Timeline type.
|
||||||
|
|
||||||
|
`timeline.rs` is already one of the most egregiously long source files in the tree, so
|
||||||
|
this is rejected on the basis that we need to avoid making that complexity worse.
|
||||||
@@ -13,11 +13,7 @@ use std::{
|
|||||||
|
|
||||||
use measured::{
|
use measured::{
|
||||||
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||||
metric::{
|
metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
|
||||||
group::{Encoding, MetricValue},
|
|
||||||
name::MetricNameEncoder,
|
|
||||||
Metric, MetricType, MetricVec,
|
|
||||||
},
|
|
||||||
text::TextEncoder,
|
text::TextEncoder,
|
||||||
LabelGroup,
|
LabelGroup,
|
||||||
};
|
};
|
||||||
@@ -144,6 +140,7 @@ impl<const N: usize> HyperLogLogState<N> {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||||
for HyperLogLogState<N>
|
for HyperLogLogState<N>
|
||||||
{
|
{
|
||||||
@@ -182,12 +179,13 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
|
|||||||
.into_iter()
|
.into_iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.try_for_each(|(hll_shard, val)| {
|
.try_for_each(|(hll_shard, val)| {
|
||||||
enc.write_metric_value(
|
CounterState::new(val as u64).collect_into(
|
||||||
name.by_ref(),
|
&(),
|
||||||
labels.by_ref().compose_with(HllShardLabel {
|
labels.by_ref().compose_with(HllShardLabel {
|
||||||
hll_shard: hll_shard as i64,
|
hll_shard: hll_shard as i64,
|
||||||
}),
|
}),
|
||||||
MetricValue::Int(val as i64),
|
name.by_ref(),
|
||||||
|
enc,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ use measured::{
|
|||||||
metric::{
|
metric::{
|
||||||
counter::CounterState,
|
counter::CounterState,
|
||||||
gauge::GaugeState,
|
gauge::GaugeState,
|
||||||
group::{Encoding, MetricValue},
|
group::Encoding,
|
||||||
name::{MetricName, MetricNameEncoder},
|
name::{MetricName, MetricNameEncoder},
|
||||||
MetricEncoding, MetricFamilyEncoding,
|
MetricEncoding, MetricFamilyEncoding,
|
||||||
},
|
},
|
||||||
@@ -171,8 +171,11 @@ fn write_gauge<Enc: Encoding>(
|
|||||||
labels: impl LabelGroup,
|
labels: impl LabelGroup,
|
||||||
name: impl MetricNameEncoder,
|
name: impl MetricNameEncoder,
|
||||||
enc: &mut Enc,
|
enc: &mut Enc,
|
||||||
) -> Result<(), Enc::Err> {
|
) -> Result<(), Enc::Err>
|
||||||
enc.write_metric_value(name, labels, MetricValue::Int(x))
|
where
|
||||||
|
GaugeState: MetricEncoding<Enc>,
|
||||||
|
{
|
||||||
|
GaugeState::new(x).collect_into(&(), labels, name, enc)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@@ -544,15 +547,6 @@ impl<T: Encoding> Encoding for Inc<T> {
|
|||||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||||
self.0.write_help(name, help)
|
self.0.write_help(name, help)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_metric_value(
|
|
||||||
&mut self,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
value: MetricValue,
|
|
||||||
) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_metric_value(name, labels, value)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
||||||
@@ -579,15 +573,6 @@ impl<T: Encoding> Encoding for Dec<T> {
|
|||||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||||
self.0.write_help(name, help)
|
self.0.write_help(name, help)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_metric_value(
|
|
||||||
&mut self,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
value: MetricValue,
|
|
||||||
) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_metric_value(name, labels, value)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Write the dec counter to the encoder
|
/// Write the dec counter to the encoder
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18;
|
|||||||
/// See [`Key::to_i128`] for more information on the encoding.
|
/// See [`Key::to_i128`] for more information on the encoding.
|
||||||
pub const METADATA_KEY_SIZE: usize = 16;
|
pub const METADATA_KEY_SIZE: usize = 16;
|
||||||
|
|
||||||
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
|
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key.
|
||||||
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
|
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
|
||||||
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
|
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,16 @@ pub struct KeySpace {
|
|||||||
pub ranges: Vec<Range<Key>>,
|
pub ranges: Vec<Range<Key>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for KeySpace {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "[")?;
|
||||||
|
for range in &self.ranges {
|
||||||
|
write!(f, "{}..{},", range.start, range.end)?;
|
||||||
|
}
|
||||||
|
write!(f, "]")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A wrapper type for sparse keyspaces.
|
/// A wrapper type for sparse keyspaces.
|
||||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||||
pub struct SparseKeySpace(pub KeySpace);
|
pub struct SparseKeySpace(pub KeySpace);
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ use std::{
|
|||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::{BufRead, Read},
|
io::{BufRead, Read},
|
||||||
num::{NonZeroU64, NonZeroUsize},
|
num::{NonZeroU64, NonZeroUsize},
|
||||||
|
str::FromStr,
|
||||||
sync::atomic::AtomicUsize,
|
sync::atomic::AtomicUsize,
|
||||||
time::{Duration, SystemTime},
|
time::{Duration, SystemTime},
|
||||||
};
|
};
|
||||||
@@ -228,6 +229,11 @@ pub struct TimelineCreateRequest {
|
|||||||
pub pg_version: Option<u32>,
|
pub pg_version: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
|
pub struct LsnLeaseRequest {
|
||||||
|
pub lsn: Lsn,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TenantShardSplitRequest {
|
pub struct TenantShardSplitRequest {
|
||||||
pub new_shard_count: u8,
|
pub new_shard_count: u8,
|
||||||
@@ -288,7 +294,6 @@ pub struct TenantConfig {
|
|||||||
pub walreceiver_connect_timeout: Option<String>,
|
pub walreceiver_connect_timeout: Option<String>,
|
||||||
pub lagging_wal_timeout: Option<String>,
|
pub lagging_wal_timeout: Option<String>,
|
||||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||||
pub trace_read_requests: Option<bool>,
|
|
||||||
pub eviction_policy: Option<EvictionPolicy>,
|
pub eviction_policy: Option<EvictionPolicy>,
|
||||||
pub min_resident_size_override: Option<u64>,
|
pub min_resident_size_override: Option<u64>,
|
||||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||||
@@ -432,6 +437,41 @@ pub enum CompactionAlgorithm {
|
|||||||
Tiered,
|
Tiered,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
|
pub enum ImageCompressionAlgorithm {
|
||||||
|
// Disabled for writes, support decompressing during read path
|
||||||
|
Disabled,
|
||||||
|
/// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
|
||||||
|
/// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
|
||||||
|
Zstd {
|
||||||
|
level: Option<i8>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for ImageCompressionAlgorithm {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
let mut components = s.split(['(', ')']);
|
||||||
|
let first = components
|
||||||
|
.next()
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("empty string"))?;
|
||||||
|
match first {
|
||||||
|
"disabled" => Ok(ImageCompressionAlgorithm::Disabled),
|
||||||
|
"zstd" => {
|
||||||
|
let level = if let Some(v) = components.next() {
|
||||||
|
let v: i8 = v.parse()?;
|
||||||
|
Some(v)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(ImageCompressionAlgorithm::Zstd { level })
|
||||||
|
}
|
||||||
|
_ => anyhow::bail!("invalid specifier '{first}'"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct CompactionAlgorithmSettings {
|
pub struct CompactionAlgorithmSettings {
|
||||||
pub kind: CompactionAlgorithm,
|
pub kind: CompactionAlgorithm,
|
||||||
@@ -643,6 +683,16 @@ pub struct TimelineInfo {
|
|||||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||||
pub current_logical_size_non_incremental: Option<u64>,
|
pub current_logical_size_non_incremental: Option<u64>,
|
||||||
|
|
||||||
|
/// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
|
||||||
|
/// beyond the branch's branch point, we only count up to the branch point.
|
||||||
|
pub pitr_history_size: u64,
|
||||||
|
|
||||||
|
/// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
|
||||||
|
/// ancestor data used by this branch would have been retained anyway). If this is false, then
|
||||||
|
/// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
|
||||||
|
/// otherwise be able to GC.
|
||||||
|
pub within_ancestor_pitr: bool,
|
||||||
|
|
||||||
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
||||||
|
|
||||||
pub wal_source_connstr: Option<String>,
|
pub wal_source_connstr: Option<String>,
|
||||||
@@ -1614,4 +1664,25 @@ mod tests {
|
|||||||
AuxFilePolicy::CrossValidation
|
AuxFilePolicy::CrossValidation
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_image_compression_algorithm_parsing() {
|
||||||
|
use ImageCompressionAlgorithm::*;
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
|
||||||
|
Disabled
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
|
||||||
|
Zstd { level: None }
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
|
||||||
|
Zstd { level: Some(18) }
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
|
||||||
|
Zstd { level: Some(-3) }
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,59 +1,42 @@
-use std::{ops::RangeInclusive, str::FromStr};
+//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
+//!
+//! This module contains a variety of types used to represent the concept of sharding
+//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
+//! we provide a summary here.
+//!
+//! Types used to describe shards:
+//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+//!   which identifies a tenant which is not shard-aware. This means its storage paths do not include
+//!   a shard suffix.
+//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
+//!   without the tenant ID. This is useful for things that are implicitly scoped to a particular
+//!   tenant, such as layer files.
+//! - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
+//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+//!   four hex digits. An unsharded tenant is `0000`.
+//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
+//!
+//! Types used to describe the parameters for data distribution in a sharded tenant:
+//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+//!   multiple shards. Its value is given in 8kiB pages.
+//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+//!   always zero: this is provided for future upgrades that might introduce different
+//!   data distribution schemes.
+//!
+//! Examples:
+//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
+//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
+//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
+//!   and their slugs are 0004, 0104, 0204, and 0304.

 use crate::{key::Key, models::ShardParameters};
-use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
-use utils::id::TenantId;

-/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
-///
-/// This module contains a variety of types used to represent the concept of sharding
-/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
-/// we provide a summary here.
-///
-/// Types used to describe shards:
-/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
-///   which identifies a tenant which is not shard-aware. This means its storage paths do not include
-///   a shard suffix.
-/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
-/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
-///   without the tenant ID. This is useful for things that are implicitly scoped to a particular
-///   tenant, such as layer files.
-/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
-///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
-/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
-///   four hex digits. An unsharded tenant is `0000`.
-/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
-///
-/// Types used to describe the parameters for data distribution in a sharded tenant:
-/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
-///   multiple shards. Its value is given in 8kiB pages.
-/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
-///   always zero: this is provided for future upgrades that might introduce different
-///   data distribution schemes.
-///
-/// Examples:
-/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
-/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
-/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
-///   and their slugs are 0004, 0104, 0204, and 0304.
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardNumber(pub u8);
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(u8);
-
-/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
+#[doc(inline)]
+pub use ::utils::shard::*;

 /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
 /// and to check whether that [`ShardNumber`] is the same as the current shard.
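The module docs above describe the slug encoding in prose. A tiny standalone sketch of the same rule may help make the examples concrete; the helper below is hypothetical and simply mirrors the described `ShardSlug` formatting (two hex digits of shard number followed by two hex digits of shard count), it is not the re-exported implementation itself:

```rust
// Hypothetical helper mirroring the documented slug rule.
fn shard_slug(shard_number: u8, shard_count: u8) -> String {
    format!("{:02x}{:02x}", shard_number, shard_count)
}

fn main() {
    // Legacy unsharded tenant: ShardCount(0), ShardNumber(0) -> "0000".
    assert_eq!(shard_slug(0, 0), "0000");
    // Single-sharded tenant: ShardCount(1), ShardNumber(0) -> "0001".
    assert_eq!(shard_slug(0, 1), "0001");
    // Tenant with 4 shards -> "0004", "0104", "0204", "0304".
    let slugs: Vec<String> = (0..4).map(|i| shard_slug(i, 4)).collect();
    assert_eq!(slugs, ["0004", "0104", "0204", "0304"]);
}
```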
@@ -65,362 +48,6 @@ pub struct ShardIdentity {
     layout: ShardLayout,
 }

-/// Formatting helper, for generating the `shard_id` label in traces.
-struct ShardSlug<'a>(&'a TenantShardId);
-
-/// TenantShardId globally identifies a particular shard in a particular tenant.
-///
-/// These are written as `<TenantId>-<ShardSlug>`, for example:
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
-/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
-/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
-///
-/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible with TenantId: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-impl ShardCount {
-    pub const MAX: Self = Self(u8::MAX);
-
-    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
-    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
-    ///
-    /// This method returns the actual number of shards, i.e. if our internal value is
-    /// zero, we return 1 (unsharded tenants have 1 shard).
-    pub fn count(&self) -> u8 {
-        if self.0 > 0 {
-            self.0
-        } else {
-            1
-        }
-    }
-
-    /// The literal internal value: this is **not** the number of shards in the
-    /// tenant, as we have a special zero value for legacy unsharded tenants. Use
-    /// [`Self::count`] if you want to know the cardinality of shards.
-    pub fn literal(&self) -> u8 {
-        self.0
-    }
-
-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
-    pub fn is_unsharded(&self) -> bool {
-        self.0 == 0
-    }
-
-    /// `v` may be zero, or the number of shards in the tenant. `v` is what
-    /// [`Self::literal`] would return.
-    pub const fn new(val: u8) -> Self {
-        Self(val)
-    }
-}
-
-impl ShardNumber {
-    pub const MAX: Self = Self(u8::MAX);
-}
-
-impl TenantShardId {
-    pub fn unsharded(tenant_id: TenantId) -> Self {
-        Self {
-            tenant_id,
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
-    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
-    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
-        RangeInclusive::new(
-            Self {
-                tenant_id,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            },
-            Self {
-                tenant_id,
-                shard_number: ShardNumber::MAX,
-                shard_count: ShardCount::MAX,
-            },
-        )
-    }
-
-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-
-    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
-        self.shard_number == ShardNumber(0)
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
-    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
-    pub fn to_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_number: self.shard_number,
-            shard_count: self.shard_count,
-        }
-    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
-}
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
-    }
-}
-
-impl std::fmt::Display for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
-        } else {
-            // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
-            // is distinct from the normal single shard case (shard count == 1).
-            self.tenant_id.fmt(f)
-        }
-    }
-}
-
-impl std::fmt::Debug for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for TenantShardId {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
-        if s.len() == 32 {
-            // Legacy case: no shard specified
-            Ok(Self {
-                tenant_id: TenantId::from_str(s)?,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            })
-        } else if s.len() == 37 {
-            let bytes = s.as_bytes();
-            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
-            Ok(Self {
-                tenant_id,
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 18]> for TenantShardId {
-    fn from(b: [u8; 18]) -> Self {
-        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
-
-        Self {
-            tenant_id: TenantId::from(tenant_id_bytes),
-            shard_number: ShardNumber(b[16]),
-            shard_count: ShardCount(b[17]),
-        }
-    }
-}
-
-impl ShardIndex {
-    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            shard_number: number,
-            shard_count: count,
-        }
-    }
-    pub fn unsharded() -> Self {
-        Self {
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
-
-    /// For use in constructing remote storage paths: concatenate this with a TenantId
-    /// to get a fully qualified TenantShardId.
-    ///
-    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
-    /// that the legacy pre-sharding remote key format is preserved.
-    pub fn get_suffix(&self) -> String {
-        if self.is_unsharded() {
-            "".to_string()
-        } else {
-            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-        }
-    }
-}
-
-impl std::fmt::Display for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-    }
-}
-
-impl std::fmt::Debug for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for ShardIndex {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 1 byte shard number, 1 byte shard count
-        if s.len() == 4 {
-            let bytes = s.as_bytes();
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(bytes, &mut shard_parts)?;
-            Ok(Self {
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 2]> for ShardIndex {
-    fn from(b: [u8; 2]) -> Self {
-        Self {
-            shard_number: ShardNumber(b[0]),
-            shard_count: ShardCount(b[1]),
-        }
-    }
-}
-
-impl Serialize for TenantShardId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Note: while human encoding of [`TenantShardId`] is backward and forward
-            // compatible, this binary encoding is not.
-            let mut packed: [u8; 18] = [0; 18];
-            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
-            packed[16] = self.shard_number.0;
-            packed[17] = self.shard_count.0;
-
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for TenantShardId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = TenantShardId;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 18])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 18] = Deserialize::deserialize(s)?;
-                Ok(TenantShardId::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                TenantShardId::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                18,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
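The removed `TenantShardId::split` above selects child shards by congruence modulo the old shard count. A small standalone sketch of that selection rule (the helper and its integer-only signature are invented for illustration; the real method returns full `TenantShardId` values):

```rust
// Sketch of the child-shard selection rule: when going from `old_count` to
// `new_count` shards, shard `i` keeps the children congruent to `i` modulo
// the old count. An unsharded tenant (count 0) behaves like a count of 1.
fn split_children(shard_number: u8, old_count: u8, new_count: u8) -> Vec<u8> {
    let effective_old = std::cmp::max(old_count, 1);
    (0..new_count)
        .filter(|child| child % effective_old == shard_number)
        .collect()
}

fn main() {
    // Splitting an unsharded tenant into 4 shards: shard 0 owns all children.
    assert_eq!(split_children(0, 0, 4), vec![0, 1, 2, 3]);
    // Splitting a 2-shard tenant into 8: shard 1's keys land on children 1, 3, 5, 7.
    assert_eq!(split_children(1, 2, 8), vec![1, 3, 5, 7]);
}
```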
@@ -585,77 +212,6 @@ impl ShardIdentity {
     }
 }

-impl Serialize for ShardIndex {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Binary encoding is not used in index_part.json, but is included in anticipation of
-            // switching various structures (e.g. inter-process communication, remote metadata) to more
-            // compact binary encodings in future.
-            let mut packed: [u8; 2] = [0; 2];
-            packed[0] = self.shard_number.0;
-            packed[1] = self.shard_count.0;
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for ShardIndex {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = ShardIndex;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 2])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 2] = Deserialize::deserialize(s)?;
-                Ok(ShardIndex::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                ShardIndex::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                2,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
 /// in order to be able to serve basebackup requests without peer communication).
 fn key_is_shard0(key: &Key) -> bool {
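The removed serde impls give `ShardIndex` a dual encoding: hex text for human-readable formats, a packed `[u8; 2]` otherwise. A usage sketch of that behavior, assuming the impls move along with the type to `utils::shard` and that `serde_json` is available as a dev-dependency (both are assumptions here):

```rust
use utils::shard::{ShardCount, ShardIndex, ShardNumber};

fn main() {
    let idx = ShardIndex::new(ShardNumber(1), ShardCount(2));

    // JSON is a human-readable serializer, so the four-hex-digit form is used.
    let json = serde_json::to_string(&idx).unwrap();
    assert_eq!(json, "\"0102\"");

    // Round-trips back through FromStr via the deserializer's visit_str path.
    let back: ShardIndex = serde_json::from_str(&json).unwrap();
    assert_eq!(back, idx);
}
```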
@@ -737,7 +293,9 @@ pub fn describe(

 #[cfg(test)]
 mod tests {
-    use utils::Hex;
+    use std::str::FromStr;
+
+    use utils::{id::TenantId, Hex};

     use super::*;
@@ -13,6 +13,7 @@ rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tokio-rustls.workspace = true
 tracing.workspace = true

@@ -23,4 +24,4 @@ workspace_hack.workspace = true
 once_cell.workspace = true
 rustls-pemfile.workspace = true
 tokio-postgres.workspace = true
 tokio-postgres-rustls.workspace = true
@@ -16,6 +16,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -400,21 +401,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
     }

     /// Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run<F, S>(
+    pub async fn run(
         mut self,
         handler: &mut impl Handler<IO>,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S + Clone,
-        S: Future,
-    {
-        let ret = self
-            .run_message_loop(handler, shutdown_watcher.clone())
-            .await;
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
+        let ret = self.run_message_loop(handler, cancel).await;

         tokio::select! {
-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                 // do nothing; we most likely got already stopped by shutdown and will log it next.
             }
             _ = self.framed.shutdown() => {
@@ -444,21 +439,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         }
     }

-    async fn run_message_loop<F, S>(
+    async fn run_message_loop(
         &mut self,
         handler: &mut impl Handler<IO>,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S,
-        S: Future,
-    {
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
         trace!("postgres backend to {:?} started", self.peer_addr);

         tokio::select!(
             biased;

-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                 // We were requested to shut down.
                 tracing::info!("shutdown request received during handshake");
                 return Err(QueryError::Shutdown)
@@ -473,7 +464,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         let mut query_string = Bytes::new();
         while let Some(msg) = tokio::select!(
             biased;
-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                 // We were requested to shut down.
                 tracing::info!("shutdown request received in run_message_loop");
                 return Err(QueryError::Shutdown)
@@ -485,7 +476,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
             let result = self.process_message(handler, msg, &mut query_string).await;
             tokio::select!(
                 biased;
-                _ = shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     // We were requested to shut down.
                     tracing::info!("shutdown request received during response flush");
@@ -672,11 +663,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         assert!(self.state < ProtoState::Authentication);
         let have_tls = self.tls_config.is_some();
         match msg {
-            FeStartupPacket::SslRequest => {
+            FeStartupPacket::SslRequest { direct } => {
                 debug!("SSL requested");

-                self.write_message(&BeMessage::EncryptionResponse(have_tls))
-                    .await?;
+                if !direct {
+                    self.write_message(&BeMessage::EncryptionResponse(have_tls))
+                        .await?;
+                } else if !have_tls {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "direct SSL negotiation but no TLS support"
+                    )));
+                }

                 if have_tls {
                     self.start_tls().await?;
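With this change, callers no longer hand `run()` a `shutdown_watcher` closure; they share a `CancellationToken` and cancel it to stop the loop. A rough calling-convention sketch (the wrapper function, the ctrl-c wiring, and the tokio `signal` feature are assumptions made for the example, not code from this PR):

```rust
use tokio_util::sync::CancellationToken;

async fn serve<IO>(
    pgbackend: postgres_backend::PostgresBackend<IO>,
    handler: &mut impl postgres_backend::Handler<IO>,
) -> Result<(), postgres_backend::QueryError>
where
    IO: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
{
    let cancel = CancellationToken::new();

    // Another task holds a clone of the token and cancels it on shutdown.
    let cancel_on_shutdown = cancel.clone();
    tokio::spawn(async move {
        tokio::signal::ctrl_c().await.ok();
        cancel_on_shutdown.cancel();
    });

    // run() now takes the token by reference instead of a closure returning a future.
    pgbackend.run(handler, &cancel).await
}
```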
@@ -3,13 +3,14 @@ use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
 use std::io::Cursor;
-use std::{future, sync::Arc};
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::{TcpListener, TcpStream};
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
 use tokio_postgres_rustls::MakeRustlsConnect;
+use tokio_util::sync::CancellationToken;

 // generate client, server test streams
 async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -50,7 +51,7 @@ async fn simple_select() {

     tokio::spawn(async move {
         let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, future::pending::<()>).await
+        pgbackend.run(&mut handler, &CancellationToken::new()).await
     });

     let conf = Config::new();
@@ -102,7 +103,7 @@ async fn simple_select_ssl() {

     tokio::spawn(async move {
         let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, future::pending::<()>).await
+        pgbackend.run(&mut handler, &CancellationToken::new()).await
     });

     let client_cfg = rustls::ClientConfig::builder()
@@ -44,9 +44,9 @@ impl ConnectionError {
 /// Wraps async io `stream`, providing messages to write/flush + read Postgres
 /// messages.
 pub struct Framed<S> {
-    stream: S,
-    read_buf: BytesMut,
-    write_buf: BytesMut,
+    pub stream: S,
+    pub read_buf: BytesMut,
+    pub write_buf: BytesMut,
 }

 impl<S> Framed<S> {
@@ -39,14 +39,39 @@ pub enum FeMessage {
     PasswordMessage(Bytes),
 }

+#[derive(Clone, Copy, PartialEq, PartialOrd)]
+pub struct ProtocolVersion(u32);
+
+impl ProtocolVersion {
+    pub const fn new(major: u16, minor: u16) -> Self {
+        Self((major as u32) << 16 | minor as u32)
+    }
+    pub const fn minor(self) -> u16 {
+        self.0 as u16
+    }
+    pub const fn major(self) -> u16 {
+        (self.0 >> 16) as u16
+    }
+}
+
+impl fmt::Debug for ProtocolVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_list()
+            .entry(&self.major())
+            .entry(&self.minor())
+            .finish()
+    }
+}
+
 #[derive(Debug)]
 pub enum FeStartupPacket {
     CancelRequest(CancelKeyData),
-    SslRequest,
+    SslRequest {
+        direct: bool,
+    },
     GssEncRequest,
     StartupMessage {
-        major_version: u32,
-        minor_version: u32,
+        version: ProtocolVersion,
         params: StartupMessageParams,
     },
 }
@@ -301,11 +326,23 @@ impl FeStartupPacket {
     /// different from [`FeMessage::parse`] because startup messages don't have
     /// message type byte; otherwise, its comments apply.
     pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
         const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
-        const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
-        const CANCEL_REQUEST_CODE: u32 = 5678;
-        const NEGOTIATE_SSL_CODE: u32 = 5679;
-        const NEGOTIATE_GSS_CODE: u32 = 5680;
+        const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
+        const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
+        const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
+        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
+        const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
+
+        // <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
+        // First byte indicates standard SSL handshake message
+        // (It can't be a Postgres startup length because in network byte order
+        // that would be a startup packet hundreds of megabytes long)
+        if buf.first() == Some(&0x16) {
+            return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
+        }

         // need at least 4 bytes with packet len
         if buf.len() < 4 {
@@ -338,12 +375,10 @@ impl FeStartupPacket {
         let mut msg = buf.split_to(len).freeze();
         msg.advance(4); // consume len

-        let request_code = msg.get_u32();
-        let req_hi = request_code >> 16;
-        let req_lo = request_code & ((1 << 16) - 1);
+        let request_code = ProtocolVersion(msg.get_u32());
         // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
-        let message = match (req_hi, req_lo) {
-            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
+        let message = match request_code {
+            CANCEL_REQUEST_CODE => {
                 if msg.remaining() != 8 {
                     return Err(ProtocolError::BadMessage(
                         "CancelRequest message is malformed, backend PID / secret key missing"
@@ -355,21 +390,22 @@ impl FeStartupPacket {
                     cancel_key: msg.get_i32(),
                 })
             }
-            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
+            NEGOTIATE_SSL_CODE => {
                 // Requested upgrade to SSL (aka TLS)
-                FeStartupPacket::SslRequest
+                FeStartupPacket::SslRequest { direct: false }
             }
-            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
+            NEGOTIATE_GSS_CODE => {
                 // Requested upgrade to GSSAPI
                 FeStartupPacket::GssEncRequest
             }
-            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
+            version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
                 return Err(ProtocolError::Protocol(format!(
-                    "Unrecognized request code {unrecognized_code}"
+                    "Unrecognized request code {}",
+                    version.minor()
                 )));
             }
             // TODO bail if protocol major_version is not 3?
-            (major_version, minor_version) => {
+            version => {
                 // StartupMessage

                 let s = str::from_utf8(&msg).map_err(|_e| {
@@ -382,8 +418,7 @@ impl FeStartupPacket {
                 })?;

                 FeStartupPacket::StartupMessage {
-                    major_version,
-                    minor_version,
+                    version,
                     params: StartupMessageParams {
                         params: msg.slice_ref(s.as_bytes()),
                     },
@@ -522,6 +557,10 @@ pub enum BeMessage<'a> {
     RowDescription(&'a [RowDescriptor<'a>]),
     XLogData(XLogDataBody<'a>),
     NoticeResponse(&'a str),
+    NegotiateProtocolVersion {
+        version: ProtocolVersion,
+        options: &'a [&'a str],
+    },
     KeepAlive(WalSndKeepAlive),
 }
@@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> {
                     buf.put_u8(u8::from(req.request_reply));
                 });
             }
+
+            BeMessage::NegotiateProtocolVersion { version, options } => {
+                buf.put_u8(b'v');
+                write_body(buf, |buf| {
+                    buf.put_u32(version.0);
+                    buf.put_u32(options.len() as u32);
+                    for option in options.iter() {
+                        write_cstr(option, buf)?;
+                    }
+                    Ok(())
+                })?
+            }
         }
         Ok(())
     }
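`ProtocolVersion::new` packs "major.minor" into a single `u32`, so the legacy PostgreSQL magic request codes come out unchanged on the wire. A quick standalone check of the arithmetic (the struct below is a local copy just for the assertions, not the one defined in the hunk):

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
struct ProtocolVersion(u32);

impl ProtocolVersion {
    const fn new(major: u16, minor: u16) -> Self {
        Self((major as u32) << 16 | minor as u32)
    }
}

fn main() {
    // 1234 * 65536 + 5679 = 80877103, the classic SSLRequest code.
    assert_eq!(ProtocolVersion::new(1234, 5679).0, 80877103);
    // CancelRequest and GSSENCRequest sit right next to it.
    assert_eq!(ProtocolVersion::new(1234, 5678).0, 80877102);
    assert_eq!(ProtocolVersion::new(1234, 5680).0, 80877104);
    // The normal protocol version 3.0 is simply 3 << 16.
    assert_eq!(ProtocolVersion::new(3, 0).0, 196608);
}
```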
@@ -1,6 +1,5 @@
 use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};

-use anyhow::bail;
 use aws_sdk_s3::types::StorageClass;
 use camino::Utf8PathBuf;
@@ -176,20 +175,8 @@ fn serialize_storage_class<S: serde::Serializer>(
 impl RemoteStorageConfig {
     pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);

-    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
-        let document: toml_edit::Document = match toml {
-            toml_edit::Item::Table(toml) => toml.clone().into(),
-            toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
-                toml.clone().into_table().into()
-            }
-            _ => bail!("toml not a table or inline table"),
-        };
-
-        if document.is_empty() {
-            return Ok(None);
-        }
-
-        Ok(Some(toml_edit::de::from_document(document)?))
+    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
+        Ok(utils::toml_edit_ext::deserialize_item(toml)?)
     }
 }
@@ -197,7 +184,7 @@ impl RemoteStorageConfig {
 mod tests {
     use super::*;

-    fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
+    fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
         let toml = input.parse::<toml_edit::Document>().unwrap();
         RemoteStorageConfig::from_toml(toml.as_item())
     }
@@ -207,7 +194,7 @@ mod tests {
         let input = "local_path = '.'
timeout = '5s'";

-        let config = parse(input).unwrap().expect("it exists");
+        let config = parse(input).unwrap();

         assert_eq!(
             config,
@@ -229,7 +216,7 @@ timeout = '5s'";
timeout = '7s'
";

-        let config = parse(toml).unwrap().expect("it exists");
+        let config = parse(toml).unwrap();

         assert_eq!(
             config,
@@ -257,7 +244,7 @@ timeout = '5s'";
timeout = '7s'
";

-        let config = parse(toml).unwrap().expect("it exists");
+        let config = parse(toml).unwrap();

         assert_eq!(
             config,
@@ -34,10 +34,10 @@ struct SegmentSize {
 }

 struct SizeAlternatives {
-    // cheapest alternative if parent is available.
+    /// cheapest alternative if parent is available.
     incremental: SegmentSize,

-    // cheapest alternative if parent node is not available
+    /// cheapest alternative if parent node is not available
     non_incremental: Option<SegmentSize>,
 }
@@ -3,10 +3,17 @@ use std::fmt::Write;

 const SVG_WIDTH: f32 = 500.0;

+/// Different branch kind for SVG drawing.
+#[derive(PartialEq)]
+pub enum SvgBranchKind {
+    Timeline,
+    Lease,
+}
+
 struct SvgDraw<'a> {
     storage: &'a StorageModel,
     branches: &'a [String],
-    seg_to_branch: &'a [usize],
+    seg_to_branch: &'a [(usize, SvgBranchKind)],
     sizes: &'a [SegmentSizeResult],

     // layout
@@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> {
         "<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
     )?;
     writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
+    writeln!(
+        result,
+        "<line x1=\"10\" y1=\"85\" x2=\"10\" y2=\"95\" stroke-width=\"3\" stroke=\"blue\" />"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"95\">LSN lease</text>")?;
     Ok(())
 }

 pub fn draw_svg(
     storage: &StorageModel,
     branches: &[String],
-    seg_to_branch: &[usize],
+    seg_to_branch: &[(usize, SvgBranchKind)],
     sizes: &SizeResult,
 ) -> anyhow::Result<String> {
     let mut draw = SvgDraw {
@@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> {

         // Layout the timelines on Y dimension.
         // TODO
-        let mut y = 100.0;
+        let mut y = 120.0;
         let mut branch_y_coordinates = Vec::new();
         for _branch in self.branches {
             branch_y_coordinates.push(y);
@@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> {

         // Calculate coordinates for each point
         let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
-            .map(|(seg, branch_id)| {
+            .map(|(seg, (branch_id, _))| {
                 let x = (seg.lsn - min_lsn) as f32 / xscale;
                 let y = branch_y_coordinates[*branch_id];
                 (x, y)
@@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> {

         // draw a snapshot point if it's needed
         let (coord_x, coord_y) = self.seg_coordinates[seg_id];
+
+        let (_, kind) = &self.seg_to_branch[seg_id];
+        if kind == &SvgBranchKind::Lease {
+            let (x1, y1) = (coord_x, coord_y - 10.0);
+            let (x2, y2) = (coord_x, coord_y + 10.0);
+
+            let style = "stroke-width=\"3\" stroke=\"blue\"";
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(result, "  <title>leased lsn at {}</title>", seg.lsn)?;
+            writeln!(result, "</line>")?;
+        }
+
         if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
             writeln!(
                 result,
@@ -40,6 +40,7 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
+toml_edit.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
114  libs/utils/src/circuit_breaker.rs  Normal file
@@ -0,0 +1,114 @@
use std::{
    fmt::Display,
    time::{Duration, Instant},
};

use metrics::IntCounter;

/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
/// to mitigate the log spam from repeated failures.
pub struct CircuitBreaker {
    /// An identifier that enables us to log useful errors when a circuit is broken
    name: String,

    /// Consecutive failures since last success
    fail_count: usize,

    /// How many consecutive failures before we break the circuit
    fail_threshold: usize,

    /// If circuit is broken, when was it broken?
    broken_at: Option<Instant>,

    /// If set, we will auto-reset the circuit this long after it was broken. If None, broken
    /// circuits stay broken forever, or until success() is called.
    reset_period: Option<Duration>,

    /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker
    /// to permit something to keep running even if it would otherwise have tripped it.
    short_circuit: bool,
}

impl CircuitBreaker {
    pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
        Self {
            name,
            fail_count: 0,
            fail_threshold,
            broken_at: None,
            reset_period,
            short_circuit: false,
        }
    }

    /// Construct an unbreakable circuit breaker, for use in unit tests etc.
    pub fn short_circuit() -> Self {
        Self {
            name: String::new(),
            fail_threshold: 0,
            fail_count: 0,
            broken_at: None,
            reset_period: None,
            short_circuit: true,
        }
    }

    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        if self.short_circuit {
            return;
        }

        self.fail_count += 1;
        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
            self.break_circuit(metric, error);
        }
    }

    /// Call this after successfully executing an operation
    pub fn success(&mut self, metric: &IntCounter) {
        self.fail_count = 0;
        if let Some(broken_at) = &self.broken_at {
            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
                humantime::format_duration(broken_at.elapsed()));
            self.broken_at = None;
            metric.inc();
        }
    }

    /// Call this before attempting an operation, and skip the operation if we are currently broken.
    pub fn is_broken(&mut self) -> bool {
        if self.short_circuit {
            return false;
        }

        if let Some(broken_at) = self.broken_at {
            match self.reset_period {
                Some(reset_period) if broken_at.elapsed() > reset_period => {
                    self.reset_circuit();
                    false
                }
                _ => true,
            }
        } else {
            false
        }
    }

    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        self.broken_at = Some(Instant::now());
        tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}");
        metric.inc();
    }

    fn reset_circuit(&mut self) {
        self.broken_at = None;
        self.fail_count = 0;
    }
}
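A usage sketch for the new `CircuitBreaker`. The counters, the breaker name, and the fake fallible step are invented for the example; real callers would pass properly registered pageserver metrics, and whether `IntCounter::new` is re-exported by the workspace `metrics` crate is an assumption here:

```rust
use std::time::Duration;

use metrics::IntCounter;
use utils::circuit_breaker::CircuitBreaker;

fn expensive_fallible_step(attempt: u32) -> Result<(), String> {
    // Purely illustrative: fail on even attempts.
    if attempt % 2 == 0 {
        Err(format!("transient failure on attempt {attempt}"))
    } else {
        Ok(())
    }
}

fn main() {
    // Unregistered counters, just for the example.
    let broken = IntCounter::new("example_broken", "breaker tripped").unwrap();
    let unbroken = IntCounter::new("example_unbroken", "breaker reset").unwrap();

    // Break after 5 consecutive failures; auto-reset 10 minutes later.
    let mut breaker = CircuitBreaker::new("compaction".to_string(), 5, Some(Duration::from_secs(600)));

    for attempt in 0..10 {
        if breaker.is_broken() {
            // Skip the expensive operation while the circuit is open.
            continue;
        }
        match expensive_fallible_step(attempt) {
            Ok(()) => breaker.success(&unbroken),
            Err(e) => breaker.fail(&broken, e),
        }
    }
}
```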
@@ -52,17 +52,17 @@ struct RequestId(String);
 /// There could be other ways to implement similar functionality:
 ///
 /// * procmacros placed on top of all handler methods
 ///   With all the drawbacks of procmacros, brings no difference implementation-wise,
 ///   and little code reduction compared to the existing approach.
 ///
 /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
 ///   implemented for [`RouterBuilder`].
 ///   Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
 ///
 /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
 ///   later, in a post-response middleware.
 ///   Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
 ///   tries to achieve with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
 pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
@@ -74,6 +74,15 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
         .transpose()
 }

+pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
+    request: &Request<Body>,
+    param_name: &str,
+) -> Result<T, ApiError> {
+    parse_query_param(request, param_name)?.ok_or_else(|| {
+        ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
+    })
+}
+
 pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
     match request.body_mut().data().await {
         Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
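A sketch of how a handler might use the new helper. The endpoint, the `timeline_id` parameter, and the exact module paths (`utils::http::request`, `utils::http::error`) are assumptions for illustration and may not match the crate layout exactly:

```rust
use hyper::{Body, Request, Response, StatusCode};
use utils::http::error::ApiError;
use utils::http::request::must_parse_query_param;
use utils::id::TimelineId;

async fn get_timeline_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    // Unlike parse_query_param, this returns ApiError::BadRequest if
    // `timeline_id` is missing, not just if it fails to parse.
    let timeline_id: TimelineId = must_parse_query_param(&request, "timeline_id")?;

    Ok(Response::builder()
        .status(StatusCode::OK)
        .body(Body::from(format!("timeline {timeline_id}\n")))
        .unwrap())
}
```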
@@ -302,17 +302,6 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

-/// Neon Connection Id identifies long-lived connections (for example a pagestream
-/// connection with the page_service). Is used for better logging and tracing
-///
-/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
-/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-/// See [`Id`] for alternative ways to serialize it.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
-pub struct ConnectionId(Id);
-
-id_newtype!(ConnectionId);
-
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
@@ -26,6 +26,8 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;

+pub mod shard;
+
 mod hex;
 pub use hex::Hex;

@@ -94,6 +96,10 @@ pub mod env;

 pub mod poison;

+pub mod toml_edit_ext;
+
+pub mod circuit_breaker;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
451
libs/utils/src/shard.rs
Normal file
451
libs/utils/src/shard.rs
Normal file
@@ -0,0 +1,451 @@
//! See `pageserver_api::shard` for description on sharding.

use std::{ops::RangeInclusive, str::FromStr};

use hex::FromHex;
use serde::{Deserialize, Serialize};

use crate::id::TenantId;

#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);

#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(pub u8);

/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

/// Formatting helper, for generating the `shard_id` label in traces.
pub struct ShardSlug<'a>(&'a TenantShardId);

/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
///   # The second shard in a two-shard tenant
///   072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
    pub tenant_id: TenantId,
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
    /// as [`TenantShardId::unsharded`].
    ///
    /// This method returns the actual number of shards, i.e. if our internal value is
    /// zero, we return 1 (unsharded tenants have 1 shard).
    pub fn count(&self) -> u8 {
        if self.0 > 0 {
            self.0
        } else {
            1
        }
    }

    /// The literal internal value: this is **not** the number of shards in the
    /// tenant, as we have a special zero value for legacy unsharded tenants. Use
    /// [`Self::count`] if you want to know the cardinality of shards.
    pub fn literal(&self) -> u8 {
        self.0
    }

    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
    /// uses the legacy format for `TenantShardId`. See also the documentation for
    /// [`Self::count`].
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }

    /// `v` may be zero, or the number of shards in the tenant. `v` is what
    /// [`Self::literal`] would return.
    pub const fn new(val: u8) -> Self {
        Self(val)
    }
}

impl ShardNumber {
    pub const MAX: Self = Self(u8::MAX);
}
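The zero sentinel in ShardCount is the part that is easiest to trip over, so here is a minimal sketch of how count(), literal() and is_unsharded() relate. It assumes the module is reachable as `utils::shard`, as the `pub mod shard;` addition in lib.rs above suggests.

use utils::shard::ShardCount;

fn main() {
    let legacy = ShardCount::new(0); // the "unsharded" sentinel
    let sharded = ShardCount::new(4);

    assert!(legacy.is_unsharded());
    assert_eq!(legacy.count(), 1);   // still one physical shard
    assert_eq!(legacy.literal(), 0); // but the stored value stays zero

    assert!(!sharded.is_unsharded());
    assert_eq!(sharded.count(), 4);
    assert_eq!(sharded.literal(), 4);
}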
impl TenantShardId {
    pub fn unsharded(tenant_id: TenantId) -> Self {
        Self {
            tenant_id,
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
        RangeInclusive::new(
            Self {
                tenant_id,
                shard_number: ShardNumber(0),
                shard_count: ShardCount(0),
            },
            Self {
                tenant_id,
                shard_number: ShardNumber::MAX,
                shard_count: ShardCount::MAX,
            },
        )
    }

    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
        ShardSlug(self)
    }

    /// Convenience for code that has special behavior on the 0th shard.
    pub fn is_shard_zero(&self) -> bool {
        self.shard_number == ShardNumber(0)
    }

    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
    }

    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
    /// is useful when logging from code that is already in a span that includes tenant ID, to
    /// keep messages reasonably terse.
    pub fn to_index(&self) -> ShardIndex {
        ShardIndex {
            shard_number: self.shard_number,
            shard_count: self.shard_count,
        }
    }

    /// Calculate the children of this TenantShardId when splitting the overall tenant into
    /// the given number of shards.
    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
        let mut child_shards = Vec::new();
        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
            // Key mapping is based on a round robin mapping of key hash modulo shard count,
            // so our child shards are the ones which the same keys would map to.
            if shard_number % effective_old_shard_count == self.shard_number.0 {
                child_shards.push(TenantShardId {
                    tenant_id: self.tenant_id,
                    shard_number: ShardNumber(shard_number),
                    shard_count: new_shard_count,
                })
            }
        }

        child_shards
    }
}
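A short sketch of how split() and tenant_range() behave, following the round-robin rule in the loop above. The tenant id literal is purely illustrative.

use std::str::FromStr;
use utils::id::TenantId;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};

fn main() {
    // Hypothetical tenant id, used only for illustration.
    let tenant_id = TenantId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();

    // An unsharded parent owns every key, so it splits into all four children.
    let parent = TenantShardId::unsharded(tenant_id);
    assert_eq!(parent.split(ShardCount::new(4)).len(), 4);

    // Shard 1 of a 2-shard tenant keeps only the children whose numbers are
    // congruent to 1 modulo the old shard count: shards 1 and 3 of a 4-way split.
    let parent = TenantShardId {
        tenant_id,
        shard_number: ShardNumber(1),
        shard_count: ShardCount::new(2),
    };
    let numbers: Vec<u8> = parent
        .split(ShardCount::new(4))
        .iter()
        .map(|c| c.shard_number.0)
        .collect();
    assert_eq!(numbers, vec![1, 3]);

    // tenant_range() brackets every shard of the tenant, e.g. for BTreeMap range queries.
    assert!(TenantShardId::tenant_range(tenant_id).contains(&parent));
}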
impl<'a> std::fmt::Display for ShardSlug<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{:02x}{:02x}",
            self.0.shard_number.0, self.0.shard_count.0
        )
    }
}

impl std::fmt::Display for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.shard_count != ShardCount(0) {
            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
        } else {
            // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
            // is distinct from the normal single shard case (shard count == 1).
            self.tenant_id.fmt(f)
        }
    }
}

impl std::fmt::Debug for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for TenantShardId {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
        if s.len() == 32 {
            // Legacy case: no shard specified
            Ok(Self {
                tenant_id: TenantId::from_str(s)?,
                shard_number: ShardNumber(0),
                shard_count: ShardCount(0),
            })
        } else if s.len() == 37 {
            let bytes = s.as_bytes();
            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
            Ok(Self {
                tenant_id,
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}

impl From<[u8; 18]> for TenantShardId {
    fn from(b: [u8; 18]) -> Self {
        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();

        Self {
            tenant_id: TenantId::from(tenant_id_bytes),
            shard_number: ShardNumber(b[16]),
            shard_count: ShardCount(b[17]),
        }
    }
}
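Round-tripping the two string encodings described in the TenantShardId doc comment; the id literals are again only illustrative.

use std::str::FromStr;
use utils::shard::TenantShardId;

fn main() {
    // Sharded form: <tenant id>-<shard number><shard count>, both bytes in two-digit hex.
    let sharded = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102").unwrap();
    assert_eq!(sharded.shard_number.0, 1);
    assert_eq!(sharded.shard_count.0, 2);
    assert_eq!(sharded.to_string(), "072f1291a5310026820b2fe4b2968934-0102");

    // Legacy 32-character form round-trips as an unsharded id, i.e. it prints
    // exactly like a plain TenantId.
    let legacy = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();
    assert!(legacy.is_unsharded());
    assert_eq!(legacy.to_string(), "072f1291a5310026820b2fe4b2968934");
}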
impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            shard_number: number,
            shard_count: count,
        }
    }
    pub fn unsharded() -> Self {
        Self {
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }

    /// For use in constructing remote storage paths: concatenate this with a TenantId
    /// to get a fully qualified TenantShardId.
    ///
    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
    /// that the legacy pre-sharding remote key format is preserved.
    pub fn get_suffix(&self) -> String {
        if self.is_unsharded() {
            "".to_string()
        } else {
            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
        }
    }
}

impl std::fmt::Display for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
    }
}

impl std::fmt::Debug for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for ShardIndex {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 1 byte shard number, 1 byte shard count
        if s.len() == 4 {
            let bytes = s.as_bytes();
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(bytes, &mut shard_parts)?;
            Ok(Self {
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}

impl From<[u8; 2]> for ShardIndex {
    fn from(b: [u8; 2]) -> Self {
        Self {
            shard_number: ShardNumber(b[0]),
            shard_count: ShardCount(b[1]),
        }
    }
}
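How the ShardIndex formatting helpers combine, again as an illustrative sketch under the same `utils::shard` path assumption.

use utils::shard::{ShardCount, ShardIndex, ShardNumber};

fn main() {
    // Unsharded indices keep the legacy remote path layout: no suffix at all.
    assert_eq!(ShardIndex::unsharded().get_suffix(), "");

    // Sharded indices append "-<number><count>" in two-digit hex,
    // while Display alone prints just the four hex digits.
    let idx = ShardIndex::new(ShardNumber(1), ShardCount::new(2));
    assert_eq!(idx.get_suffix(), "-0102");
    assert_eq!(idx.to_string(), "0102");
}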
impl Serialize for TenantShardId {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Note: while human encoding of [`TenantShardId`] is backward and forward
            // compatible, this binary encoding is not.
            let mut packed: [u8; 18] = [0; 18];
            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
            packed[16] = self.shard_number.0;
            packed[17] = self.shard_count.0;

            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for TenantShardId {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = TenantShardId;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 18])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 18] = Deserialize::deserialize(s)?;
                Ok(TenantShardId::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                TenantShardId::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                18,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}

impl Serialize for ShardIndex {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Binary encoding is not used in index_part.json, but is included in anticipation of
            // switching various structures (e.g. inter-process communication, remote metadata) to more
            // compact binary encodings in future.
            let mut packed: [u8; 2] = [0; 2];
            packed[0] = self.shard_number.0;
            packed[1] = self.shard_count.0;
            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for ShardIndex {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = ShardIndex;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 2])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 2] = Deserialize::deserialize(s)?;
                Ok(ShardIndex::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                ShardIndex::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                2,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}
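The human-readable serde path can be exercised with serde_json; note that serde_json itself is an assumption of this sketch, not something this diff adds. The binary path simply packs the same id into the 18-byte array shown above.

use std::str::FromStr;
use utils::shard::TenantShardId;

fn main() -> Result<(), serde_json::Error> {
    let id = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102")
        .expect("valid literal");

    // Human-readable formats use the string encoding...
    let json = serde_json::to_string(&id)?;
    assert_eq!(json, "\"072f1291a5310026820b2fe4b2968934-0102\"");

    // ...and accept it back, which is what keeps API payloads forward/backward compatible.
    let back: TenantShardId = serde_json::from_str(&json)?;
    assert_eq!(back, id);
    Ok(())
}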
libs/utils/src/toml_edit_ext.rs (new file, 22 lines)
@@ -0,0 +1,22 @@
#[derive(Debug, thiserror::Error)]
pub enum Error {
    #[error("item is not a document")]
    ItemIsNotADocument,
    #[error(transparent)]
    Serde(toml_edit::de::Error),
}

pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
where
    T: serde::de::DeserializeOwned,
{
    let document: toml_edit::Document = match item {
        toml_edit::Item::Table(toml) => toml.clone().into(),
        toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
            toml.clone().into_table().into()
        }
        _ => return Err(Error::ItemIsNotADocument),
    };

    toml_edit::de::from_document(document).map_err(Error::Serde)
}
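A sketch of how a caller might use deserialize_item, in the spirit of the l0_flush parsing added to the pageserver config further below. The target struct and its fields here are hypothetical stand-ins, not the real L0FlushConfig, and anyhow is assumed for error plumbing.

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct L0FlushLike {
    // Hypothetical fields, purely for illustration.
    mode: String,
    max_concurrency: usize,
}

fn main() -> anyhow::Result<()> {
    let doc: toml_edit::Document = r#"
        [l0_flush]
        mode = "direct"
        max_concurrency = 4
    "#
    .parse()?;

    // deserialize_item accepts either a table or an inline table and
    // funnels it through toml_edit's serde support.
    let item = doc.get("l0_flush").expect("table is present");
    let parsed: L0FlushLike = utils::toml_edit_ext::deserialize_item(item)?;
    assert_eq!(parsed.max_concurrency, 4);
    Ok(())
}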
@@ -62,6 +62,7 @@ sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
+tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true

@@ -8,7 +8,7 @@ license.workspace = true
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
-reqwest.workspace = true
+reqwest = { workspace = true, features = [ "stream" ] }
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
@@ -9,6 +9,8 @@ use utils::{
    lsn::Lsn,
};

+pub use reqwest::Body as ReqwestBody;
+
pub mod util;

#[derive(Debug, Clone)]

@@ -20,6 +22,9 @@ pub struct Client {

#[derive(thiserror::Error, Debug)]
pub enum Error {
+    #[error("send request: {0}")]
+    SendRequest(reqwest::Error),
+
    #[error("receive body: {0}")]
    ReceiveBody(reqwest::Error),

@@ -173,19 +178,30 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }

+    fn start_request<U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+    ) -> reqwest::RequestBuilder {
+        let req = self.client.request(method, uri);
+        if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        }
+    }
+
    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
        body: B,
    ) -> Result<reqwest::Response> {
-        let req = self.client.request(method, uri);
-        let req = if let Some(value) = &self.authorization_header {
-            req.header(reqwest::header::AUTHORIZATION, value)
-        } else {
-            req
-        };
-        req.json(&body).send().await.map_err(Error::ReceiveBody)
+        self.start_request(method, uri)
+            .json(&body)
+            .send()
+            .await
+            .map_err(Error::ReceiveBody)
    }

    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -609,4 +625,53 @@ impl Client {
            }),
        }
    }
+
+    pub async fn import_basebackup(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        base_lsn: Lsn,
+        end_lsn: Lsn,
+        pg_version: u32,
+        basebackup_tarball: ReqwestBody,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
+            self.mgmt_api_endpoint,
+        );
+        self.start_request(Method::PUT, uri)
+            .body(basebackup_tarball)
+            .send()
+            .await
+            .map_err(Error::SendRequest)?
+            .error_from_body()
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn import_wal(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        start_lsn: Lsn,
+        end_lsn: Lsn,
+        wal_tarball: ReqwestBody,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
+            self.mgmt_api_endpoint,
+        );
+        self.start_request(Method::PUT, uri)
+            .body(wal_tarball)
+            .send()
+            .await
+            .map_err(Error::SendRequest)?
+            .error_from_body()
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
}
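A hedged sketch of driving the two new client methods above. The crate path pageserver_client::mgmt_api, the tarball path, and the pg_version value are assumptions made for illustration; Body::wrap_stream relies on the reqwest "stream" feature enabled in the Cargo.toml hunk above.

// Assumed import paths; adjust to wherever the mgmt API client actually lives.
use pageserver_client::mgmt_api::{Client, ReqwestBody};
use tokio_util::io::ReaderStream;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

async fn upload_base(
    client: &Client,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    base_lsn: Lsn,
    end_lsn: Lsn,
) -> anyhow::Result<()> {
    // Stream the tarball instead of buffering it in memory.
    let file = tokio::fs::File::open("/tmp/base.tar").await?;
    let body = ReqwestBody::wrap_stream(ReaderStream::new(file));
    client
        .import_basebackup(tenant_id, timeline_id, base_lsn, end_lsn, 16, body)
        .await?;
    Ok(())
}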
@@ -131,7 +131,7 @@ impl CompactionKey for Key {
pub type CompactionKeySpace<K> = Vec<Range<K>>;

/// Functions needed from all layers.
-pub trait CompactionLayer<K: CompactionKey + ?Sized> {
+pub trait CompactionLayer<K: CompactionKey> {
    fn key_range(&self) -> &Range<K>;
    fn lsn_range(&self) -> &Range<Lsn>;

@@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> {
    let toml_item = toml_document
        .get("remote_storage")
        .expect("need remote_storage");
-    let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
+    let config = RemoteStorageConfig::from_toml(toml_item)?;
    let storage = remote_storage::GenericRemoteStorage::from_config(&config);
    let cancel = CancellationToken::new();
    storage
@@ -348,35 +348,36 @@ where
                self.add_rel(rel, rel).await?;
            }
        }
-            for (path, content) in self
-                .timeline
-                .list_aux_files(self.lsn, self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
-            {
-                if path.starts_with("pg_replslot") {
-                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
-                    let restart_lsn = Lsn(u64::from_le_bytes(
-                        content[offs..offs + 8].try_into().unwrap(),
-                    ));
-                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
-                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-                } else if path == "pg_logical/replorigin_checkpoint" {
-                    // replorigin_checkoint is written only on compute shutdown, so it contains
-                    // deteriorated values. So we generate our own version of this file for the particular LSN
-                    // based on information about replorigins extracted from transaction commit records.
-                    // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
-                    // but now we should handle (skip) it for backward compatibility.
-                    continue;
-                }
-                let header = new_tar_header(&path, content.len() as u64)?;
-                self.ar
-                    .append(&header, &*content)
-                    .await
-                    .context("could not add aux file to basebackup tarball")?;
-            }
        }
+
+        for (path, content) in self
+            .timeline
+            .list_aux_files(self.lsn, self.ctx)
+            .await
+            .map_err(|e| BasebackupError::Server(e.into()))?
+        {
+            if path.starts_with("pg_replslot") {
+                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
+                let restart_lsn = Lsn(u64::from_le_bytes(
+                    content[offs..offs + 8].try_into().unwrap(),
+                ));
+                info!("Replication slot {} restart LSN={}", path, restart_lsn);
+                min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
+            } else if path == "pg_logical/replorigin_checkpoint" {
+                // replorigin_checkoint is written only on compute shutdown, so it contains
+                // deteriorated values. So we generate our own version of this file for the particular LSN
+                // based on information about replorigins extracted from transaction commit records.
+                // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
+                // but now we should handle (skip) it for backward compatibility.
+                continue;
+            }
+            let header = new_tar_header(&path, content.len() as u64)?;
+            self.ar
+                .append(&header, &*content)
+                .await
+                .context("could not add aux file to basebackup tarball")?;
+        }

        if min_restart_lsn != Lsn::MAX {
            info!(
                "Min restart LSN for logical replication is {}",
@@ -47,6 +47,9 @@ use utils::{
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
const PID_FILE_NAME: &str = "pageserver.pid";

const FEATURES: &[&str] = &[

@@ -421,6 +424,10 @@ fn start_pageserver(
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

+    info!(config=?conf.l0_flush, "using l0_flush config");
+    let l0_flush_global_state =
+        pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
+
    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(

@@ -429,6 +436,7 @@ fn start_pageserver(
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
+            l0_flush_global_state,
        },
        order,
        shutdown_pageserver.clone(),

@@ -652,7 +660,6 @@ fn start_pageserver(
        async move {
            page_service::libpq_listener_main(
                tenant_manager,
-                broker_client,
                pg_auth,
                pageserver_listener,
                conf.pg_auth_type,
@@ -5,14 +5,13 @@
//! See also `settings.md` for better description on every parameter.

use anyhow::{anyhow, bail, ensure, Context, Result};
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
-use utils::id::ConnectionId;
use utils::logging::SecretString;

use once_cell::sync::OnceCell;

@@ -30,11 +29,11 @@ use utils::{
    logging::LogFormat,
};

-use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
+use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};

@@ -50,6 +49,7 @@ pub mod defaults {
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_PG_LISTEN_PORT,
    };
+    use pageserver_api::models::ImageCompressionAlgorithm;
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";

@@ -90,6 +90,9 @@ pub mod defaults {
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

+    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
+        ImageCompressionAlgorithm::Disabled;
+
    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

@@ -159,7 +162,7 @@ pub mod defaults {
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}

-[remote_storage]
+#[remote_storage]

"#
);

@@ -285,12 +288,16 @@ pub struct PageServerConf {
    pub validate_vectored_get: bool,

+    pub image_compression: ImageCompressionAlgorithm,
+
    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
    /// of ephemeral data.
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub l0_flush: L0FlushConfig,
}

/// We do not want to store this in a PageServerConf because the latter may be logged

@@ -395,7 +402,11 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,

+    image_compression: BuilderValue<ImageCompressionAlgorithm>,
+
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+    l0_flush: BuilderValue<L0FlushConfig>,
}

impl PageServerConfigBuilder {

@@ -482,8 +493,10 @@ impl PageServerConfigBuilder {
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
+            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+            l0_flush: Set(L0FlushConfig::default()),
        }
    }
}

@@ -667,10 +680,18 @@ impl PageServerConfigBuilder {
        self.validate_vectored_get = BuilderValue::Set(value);
    }

+    pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
+        self.image_compression = BuilderValue::Set(value);
+    }
+
    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
    }

+    pub fn l0_flush(&mut self, value: L0FlushConfig) {
+        self.l0_flush = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -727,7 +748,9 @@ impl PageServerConfigBuilder {
            get_impl,
            max_vectored_read_bytes,
            validate_vectored_get,
+            image_compression,
            ephemeral_bytes_per_memory_kb,
+            l0_flush,
        }
        CUSTOM LOGIC
        {

@@ -846,22 +869,6 @@ impl PageServerConf {
        )
    }

-    pub fn traces_path(&self) -> Utf8PathBuf {
-        self.workdir.join("traces")
-    }
-
-    pub fn trace_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-        connection_id: &ConnectionId,
-    ) -> Utf8PathBuf {
-        self.traces_path()
-            .join(tenant_shard_id.to_string())
-            .join(timeline_id.to_string())
-            .join(connection_id.to_string())
-    }
-
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)

@@ -918,7 +925,7 @@ impl PageServerConf {
                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
                "remote_storage" => {
-                    builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
+                    builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
                }
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;

@@ -946,7 +953,7 @@ impl PageServerConf {
                    builder.metric_collection_endpoint(Some(endpoint));
                },
                "metric_collection_bucket" => {
-                    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
+                    builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
                }
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),

@@ -1004,9 +1011,15 @@ impl PageServerConf {
                "validate_vectored_get" => {
                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
                }
+                "image_compression" => {
+                    builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
+                }
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
+                "l0_flush" => {
+                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }

@@ -1088,8 +1101,10 @@ impl PageServerConf {
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
            ),
+            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            l0_flush: L0FlushConfig::default(),
        }
    }
}

@@ -1328,7 +1343,9 @@ background_task_maximum_delay = '334 s'
                    .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                l0_flush: L0FlushConfig::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );

@@ -1401,7 +1418,9 @@ background_task_maximum_delay = '334 s'
                    .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                l0_flush: L0FlushConfig::default(),
            },
            "Should be able to parse all basic config values correctly"
        );

@@ -1524,34 +1543,6 @@ broker_endpoint = '{broker_endpoint}'
        Ok(())
    }

-    #[test]
-    fn parse_tenant_config() -> anyhow::Result<()> {
-        let tempdir = tempdir()?;
-        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
-
-        let broker_endpoint = "http://127.0.0.1:7777";
-        let trace_read_requests = true;
-
-        let config_string = format!(
-            r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{pg_distrib_dir}'
-broker_endpoint = '{broker_endpoint}'
-
-[tenant_config]
-trace_read_requests = {trace_read_requests}"#,
-        );
-
-        let toml = config_string.parse()?;
-
-        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
-        assert_eq!(
-            conf.default_tenant_conf.trace_read_requests, trace_read_requests,
-            "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
-        );
-
-        Ok(())
-    }
-
    #[test]
    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
        let config_string = r#"

@@ -1681,6 +1672,19 @@ threshold = "20m"
        }
    }

+    #[test]
+    fn empty_remote_storage_is_error() {
+        let tempdir = tempdir().unwrap();
+        let (workdir, _) = prepare_fs(&tempdir).unwrap();
+        let input = r#"
+            remote_storage = {}
+        "#;
+        let doc = toml_edit::Document::from_str(input).unwrap();
+        let err = PageServerConf::parse_and_validate(&doc, &workdir)
+            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
+        assert!(format!("{err}").contains("remote_storage"), "{err}");
+    }
+
    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
        let tempdir_path = tempdir.path();
@@ -59,6 +59,7 @@
//! 1. It should be easy to forward the context to callees.
//! 2. To propagate more data from high-level to low-level code, the functions in
//!    the middle should not need to be modified.
+//!
//! The solution is to have a container structure ([`RequestContext`]) that
//! carries the information. Functions that don't care about what's in it
//! pass it along to callees.

@@ -190,7 +190,7 @@ where
            }
        } else {
            // If we failed validation, then do not apply any of the projected updates
-            warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+            info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
            metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
        }
    }

@@ -225,7 +225,7 @@ where
            && (tenant.generation == *validated_generation);

        if !this_list_valid {
-            warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+            info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
            metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
            mutated = true;
        } else {
@@ -265,15 +265,19 @@ paths:
            type: string
            format: hex
    post:
-      description: Obtain lease for the given LSN
-      parameters:
-        - name: lsn
-          in: query
-          required: true
-          schema:
-            type: string
-            format: hex
-          description: A LSN to obtain the lease for
+      description: Obtains a lease for the given LSN.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - lsn
+              properties:
+                lsn:
+                  description: A LSN to obtain the lease for.
+                  type: string
+                  format: hex
      responses:
        "200":
          description: OK

@@ -869,8 +873,6 @@ components:
              type: string
            max_lsn_wal_lag:
              type: integer
-            trace_read_requests:
-              type: boolean
            heatmap_period:
              type: string
    TenantConfigResponse:
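With the schema change above (and the matching lsn_lease_handler change further below), the LSN now travels in a JSON request body instead of a query parameter. A sketch of the new call shape from an external client; the endpoint URL and port are hypothetical, and reqwest's json feature plus serde_json are assumptions of this example.

async fn request_lease(
    http: &reqwest::Client,
    tenant_shard_id: &str,
    timeline_id: &str,
    lsn: &str,
) -> anyhow::Result<()> {
    // Hypothetical local pageserver management endpoint.
    let url = format!(
        "http://127.0.0.1:9898/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease"
    );
    // Previously the LSN was passed as `?lsn=...`; it is now a JSON body
    // with the single required field `lsn`.
    http.post(url)
        .json(&serde_json::json!({ "lsn": lsn }))
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}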
|
|||||||
@@ -10,6 +10,7 @@ use std::time::Duration;
|
|||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
|
use futures::StreamExt;
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
use humantime::format_rfc3339;
|
use humantime::format_rfc3339;
|
||||||
use hyper::header;
|
use hyper::header;
|
||||||
@@ -22,6 +23,7 @@ use pageserver_api::models::ListAuxFilesRequest;
|
|||||||
use pageserver_api::models::LocationConfig;
|
use pageserver_api::models::LocationConfig;
|
||||||
use pageserver_api::models::LocationConfigListResponse;
|
use pageserver_api::models::LocationConfigListResponse;
|
||||||
use pageserver_api::models::LsnLease;
|
use pageserver_api::models::LsnLease;
|
||||||
|
use pageserver_api::models::LsnLeaseRequest;
|
||||||
use pageserver_api::models::ShardParameters;
|
use pageserver_api::models::ShardParameters;
|
||||||
use pageserver_api::models::TenantDetails;
|
use pageserver_api::models::TenantDetails;
|
||||||
use pageserver_api::models::TenantLocationConfigResponse;
|
use pageserver_api::models::TenantLocationConfigResponse;
|
||||||
@@ -42,13 +44,15 @@ use pageserver_api::shard::TenantShardId;
|
|||||||
use remote_storage::DownloadError;
|
use remote_storage::DownloadError;
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use remote_storage::TimeTravelError;
|
use remote_storage::TimeTravelError;
|
||||||
use tenant_size_model::{SizeResult, StorageModel};
|
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
|
||||||
|
use tokio_util::io::StreamReader;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::auth::JwtAuth;
|
use utils::auth::JwtAuth;
|
||||||
use utils::failpoint_support::failpoints_handler;
|
use utils::failpoint_support::failpoints_handler;
|
||||||
use utils::http::endpoint::prometheus_metrics_handler;
|
use utils::http::endpoint::prometheus_metrics_handler;
|
||||||
use utils::http::endpoint::request_span;
|
use utils::http::endpoint::request_span;
|
||||||
|
use utils::http::request::must_parse_query_param;
|
||||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||||
|
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
@@ -227,7 +231,7 @@ impl From<UpsertLocationError> for ApiError {
|
|||||||
BadRequest(e) => ApiError::BadRequest(e),
|
BadRequest(e) => ApiError::BadRequest(e),
|
||||||
Unavailable(_) => ApiError::ShuttingDown,
|
Unavailable(_) => ApiError::ShuttingDown,
|
||||||
e @ InProgress => ApiError::Conflict(format!("{e}")),
|
e @ InProgress => ApiError::Conflict(format!("{e}")),
|
||||||
Flush(e) | Other(e) => ApiError::InternalServerError(e),
|
Flush(e) | InternalError(e) => ApiError::InternalServerError(e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -406,6 +410,8 @@ async fn build_timeline_info_common(
|
|||||||
|
|
||||||
let walreceiver_status = timeline.walreceiver_status();
|
let walreceiver_status = timeline.walreceiver_status();
|
||||||
|
|
||||||
|
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
|
||||||
|
|
||||||
let info = TimelineInfo {
|
let info = TimelineInfo {
|
||||||
tenant_id: timeline.tenant_shard_id,
|
tenant_id: timeline.tenant_shard_id,
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
@@ -426,6 +432,8 @@ async fn build_timeline_info_common(
|
|||||||
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
||||||
current_physical_size,
|
current_physical_size,
|
||||||
current_logical_size_non_incremental: None,
|
current_logical_size_non_incremental: None,
|
||||||
|
pitr_history_size,
|
||||||
|
within_ancestor_pitr,
|
||||||
timeline_dir_layer_file_size_sum: None,
|
timeline_dir_layer_file_size_sum: None,
|
||||||
wal_source_connstr,
|
wal_source_connstr,
|
||||||
last_received_msg_lsn,
|
last_received_msg_lsn,
|
||||||
@@ -1191,10 +1199,15 @@ fn synthetic_size_html_response(
|
|||||||
timeline_map.insert(ti.timeline_id, index);
|
timeline_map.insert(ti.timeline_id, index);
|
||||||
timeline_ids.push(ti.timeline_id.to_string());
|
timeline_ids.push(ti.timeline_id.to_string());
|
||||||
}
|
}
|
||||||
let seg_to_branch: Vec<usize> = inputs
|
let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs
|
||||||
.segments
|
.segments
|
||||||
.iter()
|
.iter()
|
||||||
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
|
.map(|seg| {
|
||||||
|
(
|
||||||
|
*timeline_map.get(&seg.timeline_id).unwrap(),
|
||||||
|
seg.kind.into(),
|
||||||
|
)
|
||||||
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let svg =
|
let svg =
|
||||||
@@ -1296,7 +1309,7 @@ async fn update_tenant_config_handler(
|
|||||||
|
|
||||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
|
||||||
tenant.set_new_tenant_config(new_tenant_conf);
|
tenant.set_new_tenant_config(new_tenant_conf);
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -1527,15 +1540,13 @@ async fn handle_tenant_break(
|
|||||||
|
|
||||||
// Obtains an lsn lease on the given timeline.
|
// Obtains an lsn lease on the given timeline.
|
||||||
async fn lsn_lease_handler(
|
async fn lsn_lease_handler(
|
||||||
request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
let lsn = json_request::<LsnLeaseRequest>(&mut request).await?.lsn;
|
||||||
let lsn: Lsn = parse_query_param(&request, "lsn")?
|
|
||||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
@@ -2396,6 +2407,189 @@ async fn post_top_tenants(
     )
 }

+async fn put_tenant_timeline_import_basebackup(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
+    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
+    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
+
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
+    async move {
+        let state = get_state(&request);
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
+
+        let broker_client = state.broker_client.clone();
+
+        let mut body = StreamReader::new(request.into_body().map(|res| {
+            res.map_err(|error| {
+                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+            })
+        }));
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        let timeline = tenant
+            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
+            .map_err(ApiError::InternalServerError)
+            .await?;
+
+        // TODO mark timeline as not ready until it reaches end_lsn.
+        // We might have some wal to import as well, and we should prevent compute
+        // from connecting before that and writing conflicting wal.
+        //
+        // This is not relevant for pageserver->pageserver migrations, since there's
+        // no wal to import. But should be fixed if we want to import from postgres.
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Import basebackup provided via CopyData
+        info!("importing basebackup");
+
+        timeline
+            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        // Read the end of the tar archive.
+        read_tar_eof(body)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        // TODO check checksum
+        // Meanwhile you can verify client-side by taking fullbackup
+        // and checking that it matches in size with what was imported.
+        // It wouldn't work if base came from vanilla postgres though,
+        // since we discard some log files.
+
+        info!("done");
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(span)
+    .await
+}
+
+async fn put_tenant_timeline_import_wal(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
+    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
+
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
+    async move {
+        let state = get_state(&request);
+
+        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
+
+        let mut body = StreamReader::new(request.into_body().map(|res| {
+            res.map_err(|error| {
+                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+            })
+        }));
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
+        }
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Import wal provided via CopyData
+        info!("importing wal");
+        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
+        info!("wal import complete");
+
+        // Read the end of the tar archive.
+        read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
+
+        // TODO Does it make sense to overshoot?
+        if timeline.get_last_record_lsn() < end_lsn {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
+        }
+
+        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
+        // We only want to persist the data, and it doesn't matter if it's in the
+        // shape of deltas or images.
+        info!("flushing layers");
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
+            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+        })?;
+
+        info!("done");
+
+        json_response(StatusCode::OK, ())
+    }.instrument(span).await
+}
+
+/// Read the end of a tar archive.
+///
+/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
+/// `tokio_tar` already read the first such block. Read the second all-zeros block,
+/// and check that there is no more data after the EOF marker.
+///
+/// 'tar' command can also write extra blocks of zeros, up to a record
+/// size, controlled by the --record-size argument. Ignore them too.
+async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    let mut buf = [0u8; 512];
+
+    // Read the all-zeros block, and verify it
+    let mut total_bytes = 0;
+    while total_bytes < 512 {
+        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
+        total_bytes += nbytes;
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if total_bytes < 512 {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if !buf.iter().all(|&x| x == 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Drain any extra zero-blocks after the EOF marker
+    let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
+    loop {
+        let nbytes = reader.read(&mut buf).await?;
+        trailing_bytes += nbytes;
+        if !buf.iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    }
+    Ok(())
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2690,5 +2884,13 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
             |r| testing_api_handler("perf_info", r, perf_info),
         )
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
+            |r| api_handler(r, put_tenant_timeline_import_basebackup),
+        )
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
+            |r| api_handler(r, put_tenant_timeline_import_wal),
+        )
         .any(handler_404))
 }
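These two routes move basebackup and WAL import onto the HTTP management API (the libpq "import basebackup" / "import wal" commands are removed from the page service later in this diff). A rough sketch of driving the basebackup endpoint from a client; the path and query parameter names come from the handlers above, while the host, port, LSNs, Postgres version, and absence of auth headers are placeholders and assumptions.

// Hypothetical caller of the import_basebackup endpoint; host/port and values are placeholders.
fn import_basebackup() -> Result<(), Box<dyn std::error::Error>> {
    let tenant_id = "<tenant_id>";
    let timeline_id = "<timeline_id>";
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup\
         ?base_lsn=0/169AD58&end_lsn=0/169AD58&pg_version=15"
    );
    // The request body is the raw base.tar produced by a basebackup.
    let body = std::fs::read("base.tar")?;
    let resp = reqwest::blocking::Client::new().put(url).body(body).send()?;
    assert!(resp.status().is_success());
    Ok(())
}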
pageserver/src/l0_flush.rs (new file, 46 lines)
@@ -0,0 +1,46 @@
+use std::{num::NonZeroUsize, sync::Arc};
+
+use crate::tenant::ephemeral_file;
+
+#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+pub enum L0FlushConfig {
+    #[default]
+    PageCached,
+    #[serde(rename_all = "snake_case")]
+    Direct { max_concurrency: NonZeroUsize },
+}
+
+#[derive(Clone)]
+pub struct L0FlushGlobalState(Arc<Inner>);
+
+pub(crate) enum Inner {
+    PageCached,
+    Direct { semaphore: tokio::sync::Semaphore },
+}
+
+impl L0FlushGlobalState {
+    pub fn new(config: L0FlushConfig) -> Self {
+        match config {
+            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
+            L0FlushConfig::Direct { max_concurrency } => {
+                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
+                Self(Arc::new(Inner::Direct { semaphore }))
+            }
+        }
+    }
+
+    pub(crate) fn inner(&self) -> &Arc<Inner> {
+        &self.0
+    }
+}
+
+impl L0FlushConfig {
+    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
+        use L0FlushConfig::*;
+        match self {
+            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
+            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
+        }
+    }
+}
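The serde attributes above fix the external representation of `L0FlushConfig`: an internal `mode` tag, kebab-case variant names, and snake_case field names. A self-contained sketch of that representation, using a local mirror of the enum and JSON purely for illustration (the config format the pageserver actually reads this from is not shown in this diff):

use std::num::NonZeroUsize;

// Local mirror of the enum above, only for demonstrating the serde representation.
#[derive(Debug, PartialEq, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
enum L0FlushConfig {
    PageCached,
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}

fn main() {
    let page_cached: L0FlushConfig =
        serde_json::from_str(r#"{ "mode": "page-cached" }"#).unwrap();
    assert_eq!(page_cached, L0FlushConfig::PageCached);

    let direct: L0FlushConfig =
        serde_json::from_str(r#"{ "mode": "direct", "max_concurrency": 4 }"#).unwrap();
    assert_eq!(
        direct,
        L0FlushConfig::Direct { max_concurrency: NonZeroUsize::new(4).unwrap() }
    );
}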
@@ -11,6 +11,7 @@ pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
+pub mod l0_flush;
 pub use pageserver_api::keyspace;
 pub mod aux_file;
 pub mod metrics;
@@ -22,7 +23,6 @@ pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
-pub mod trace;
 pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
@@ -8,7 +8,7 @@ use metrics::{
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
-use strum::{EnumCount, IntoEnumIterator, VariantNames};
+use strum::{EnumCount, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use tracing::warn;
 use utils::id::TimelineId;
@@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

+static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_pitr_history_size",
+        "Data written since PITR cutoff on this timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_archive_size",
+        "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_standby_horizon",
@@ -476,7 +494,7 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_resident_physical_size",
-        "The size of the layer files present in the pageserver's filesystem.",
+        "The size of the layer files present in the pageserver's filesystem, for attached locations.",
         &["tenant_id", "shard_id", "timeline_id"]
     )
     .expect("failed to define a metric")
@@ -551,6 +569,22 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

+pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_circuit_breaker_broken",
+        "How many times a circuit breaker has broken"
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_circuit_breaker_unbroken",
+        "How many times a circuit breaker has been un-broken (recovered)"
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) mod initial_logical_size {
     use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
     use once_cell::sync::Lazy;
@@ -1076,21 +1110,12 @@ pub(crate) mod virtual_file_io_engine {
     });
 }

-#[derive(Debug)]
-struct GlobalAndPerTimelineHistogram {
-    global: Histogram,
-    per_tenant_timeline: Histogram,
-}
-
-impl GlobalAndPerTimelineHistogram {
-    fn observe(&self, value: f64) {
-        self.global.observe(value);
-        self.per_tenant_timeline.observe(value);
-    }
-}
-
 struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    h: &'a GlobalAndPerTimelineHistogram,
+    global_metric: &'a Histogram,
+
+    // Optional because not all op types are tracked per-timeline
+    timeline_metric: Option<&'a Histogram>,
+
     ctx: &'c RequestContext,
     start: std::time::Instant,
     op: SmgrQueryType,
@@ -1121,7 +1146,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                 elapsed
             }
         };
-        self.h.observe(ex_throttled.as_secs_f64());
+        self.global_metric.observe(ex_throttled.as_secs_f64());
+        if let Some(timeline_metric) = self.timeline_metric {
+            timeline_metric.observe(ex_throttled.as_secs_f64());
+        }
     }
 }

@@ -1146,7 +1174,8 @@ pub enum SmgrQueryType {

 #[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
-    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
+    global_metrics: [Histogram; SmgrQueryType::COUNT],
+    per_timeline_getpage: Histogram,
 }

 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
@@ -1224,27 +1253,32 @@ impl SmgrQueryTimePerTimeline {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
         let shard_slug = format!("{}", tenant_shard_id.shard_slug());
         let timeline_id = timeline_id.to_string();
-        let metrics = std::array::from_fn(|i| {
+        let global_metrics = std::array::from_fn(|i| {
             let op = SmgrQueryType::from_repr(i).unwrap();
-            let global = SMGR_QUERY_TIME_GLOBAL
+            SMGR_QUERY_TIME_GLOBAL
                 .get_metric_with_label_values(&[op.into()])
-                .unwrap();
-            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
-                .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
-                .unwrap();
-            GlobalAndPerTimelineHistogram {
-                global,
-                per_tenant_timeline,
-            }
+                .unwrap()
         });
-        Self { metrics }
+        let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+            .get_metric_with_label_values(&[
+                SmgrQueryType::GetPageAtLsn.into(),
+                &tenant_id,
+                &shard_slug,
+                &timeline_id,
+            ])
+            .unwrap();
+        Self {
+            global_metrics,
+            per_timeline_getpage,
+        }
     }
     pub(crate) fn start_timer<'c: 'a, 'a>(
         &'a self,
         op: SmgrQueryType,
         ctx: &'c RequestContext,
-    ) -> impl Drop + '_ {
-        let metric = &self.metrics[op as usize];
+    ) -> Option<impl Drop + '_> {
+        let global_metric = &self.global_metrics[op as usize];
         let start = Instant::now();
         match ctx.micros_spent_throttled.open() {
             Ok(()) => (),
@@ -1263,12 +1297,20 @@ impl SmgrQueryTimePerTimeline {
                 });
             }
         }
-        GlobalAndPerTimelineHistogramTimer {
-            h: metric,
+        let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
+            Some(&self.per_timeline_getpage)
+        } else {
+            None
+        };
+
+        Some(GlobalAndPerTimelineHistogramTimer {
+            global_metric,
+            timeline_metric,
             ctx,
             start,
             op,
-        }
+        })
     }
 }

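After this refactor every op type still feeds a global histogram, but only GetPage requests also feed the per-tenant-timeline histogram, which is why the timer carries an `Option<&Histogram>`. A stripped-down sketch of the drop-based pattern, with plain closures standing in for the prometheus histograms:

use std::time::Instant;

// Illustrative stand-ins for the two sinks; the real code observes prometheus Histograms.
struct Timer<'a> {
    global: &'a dyn Fn(f64),
    per_timeline: Option<&'a dyn Fn(f64)>,
    start: Instant,
}

impl Drop for Timer<'_> {
    fn drop(&mut self) {
        let secs = self.start.elapsed().as_secs_f64();
        // Always record into the global sink, and into the per-timeline one only if present.
        (self.global)(secs);
        if let Some(observe) = self.per_timeline {
            observe(secs);
        }
    }
}

fn main() {
    let global = |v: f64| println!("global histogram observe {v}");
    let per_timeline = |v: f64| println!("per-timeline histogram observe {v}");
    // GetPage-style op: both sinks; other ops would pass None for per_timeline.
    let _t = Timer { global: &global, per_timeline: Some(&per_timeline), start: Instant::now() };
}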
@@ -1315,17 +1357,9 @@ mod smgr_query_time_tests {
         let get_counts = || {
             let global: u64 = ops
                 .iter()
-                .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
+                .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
                 .sum();
-            let per_tenant_timeline: u64 = ops
-                .iter()
-                .map(|op| {
-                    metrics.metrics[*op as usize]
-                        .per_tenant_timeline
-                        .get_sample_count()
-                })
-                .sum();
-            (global, per_tenant_timeline)
+            (global, metrics.per_timeline_getpage.get_sample_count())
         };

         let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1336,7 +1370,12 @@ mod smgr_query_time_tests {
         drop(timer);

         let (post_global, post_per_tenant_timeline) = get_counts();
-        assert_eq!(post_per_tenant_timeline, 1);
+        if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
+            // getpage ops are tracked per-timeline, others aren't
+            assert_eq!(post_per_tenant_timeline, 1);
+        } else {
+            assert_eq!(post_per_tenant_timeline, 0);
+        }
         assert!(post_global > pre_global);
     }
 }
@@ -1433,10 +1472,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
     }
 }

-pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_live_connections",
-        "Number of live network connections",
+pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_live_connections_started",
+        "Number of network connections that we started handling",
+        "pageserver_live_connections_finished",
+        "Number of network connections that we finished handling",
         &["pageserver_connection_kind"]
     )
     .expect("failed to define a metric")
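Switching from a single live-connections gauge to a started/finished counter pair keeps both series monotonic, so the number of live connections can still be derived as started minus finished while connection churn stays visible as a rate. The guard shape is roughly the following; this is an illustration of the pattern, not the actual `IntCounterPair` implementation in the metrics crate:

use std::sync::atomic::{AtomicU64, Ordering};

// Two monotonically increasing counters; "live" connections = started - finished.
static STARTED: AtomicU64 = AtomicU64::new(0);
static FINISHED: AtomicU64 = AtomicU64::new(0);

struct ConnectionGuard;

impl ConnectionGuard {
    fn new() -> Self {
        STARTED.fetch_add(1, Ordering::Relaxed);
        ConnectionGuard
    }
}

impl Drop for ConnectionGuard {
    fn drop(&mut self) {
        // Runs on every exit path, including unwinds, so the pair stays balanced.
        FINISHED.fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    {
        let _guard = ConnectionGuard::new();
        // ... handle the connection ...
    }
    let live = STARTED.load(Ordering::Relaxed) - FINISHED.load(Ordering::Relaxed);
    assert_eq!(live, 0);
}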
@@ -1447,10 +1488,7 @@ pub(crate) enum ComputeCommandKind {
     PageStreamV2,
     PageStream,
     Basebackup,
-    GetLastRecordRlsn,
     Fullbackup,
-    ImportBasebackup,
-    ImportWal,
     LeaseLsn,
     Show,
 }
@@ -1691,6 +1729,15 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
     }
 });

+pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_secondary_resident_physical_size",
+        "The size of the layer files present in the pageserver's filesystem, for secondary locations.",
+        &["tenant_id", "shard_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -2093,6 +2140,8 @@ pub(crate) struct TimelineMetrics {
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
+    pub pitr_history_size: UIntGauge,
+    pub archival_size: UIntGauge,
     pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
@@ -2166,6 +2215,15 @@ impl TimelineMetrics {
         let last_record_gauge = LAST_RECORD_LSN
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+
+        let pitr_history_size = PITR_HISTORY_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
+        let archival_size = TIMELINE_ARCHIVE_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
         let standby_horizon_gauge = STANDBY_HORIZON
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
@@ -2218,6 +2276,8 @@ impl TimelineMetrics {
             find_gc_cutoffs_histo,
             load_layer_map_histo,
             last_record_gauge,
+            pitr_history_size,
+            archival_size,
             standby_horizon_gauge,
             resident_physical_size_gauge,
             current_logical_size_gauge,
@@ -2275,6 +2335,10 @@ impl TimelineMetrics {
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
+
+        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+
         let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2308,14 +2372,12 @@ impl TimelineMetrics {
             let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
         }

-        for op in SmgrQueryType::iter() {
-            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
-                op.into(),
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ]);
-        }
+        let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
+            SmgrQueryType::GetPageAtLsn.into(),
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
     }
 }

@@ -4,9 +4,7 @@
 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
-use bytes::Bytes;
 use futures::stream::FuturesUnordered;
-use futures::Stream;
 use futures::StreamExt;
 use pageserver_api::key::Key;
 use pageserver_api::models::TenantState;
@@ -28,7 +26,6 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io;
 use std::net::TcpListener;
-use std::pin::pin;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -37,10 +34,8 @@ use std::time::Instant;
 use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
 use utils::{
     auth::{Claims, Scope, SwappableJwtAuth},
@@ -53,9 +48,8 @@ use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
-use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
+use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
@@ -66,13 +60,11 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
-use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Tenant;
 use crate::tenant::Timeline;
-use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -82,56 +74,6 @@ use postgres_ffi::BLCKSZ;
 // is not yet in state [`TenantState::Active`].
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

-/// Read the end of a tar archive.
-///
-/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
-/// `tokio_tar` already read the first such block. Read the second all-zeros block,
-/// and check that there is no more data after the EOF marker.
-///
-/// 'tar' command can also write extra blocks of zeros, up to a record
-/// size, controlled by the --record-size argument. Ignore them too.
-async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
-    use tokio::io::AsyncReadExt;
-    let mut buf = [0u8; 512];
-
-    // Read the all-zeros block, and verify it
-    let mut total_bytes = 0;
-    while total_bytes < 512 {
-        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
-        total_bytes += nbytes;
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if total_bytes < 512 {
-        anyhow::bail!("incomplete or invalid tar EOF marker");
-    }
-    if !buf.iter().all(|&x| x == 0) {
-        anyhow::bail!("invalid tar EOF marker");
-    }
-
-    // Drain any extra zero-blocks after the EOF marker
-    let mut trailing_bytes = 0;
-    let mut seen_nonzero_bytes = false;
-    loop {
-        let nbytes = reader.read(&mut buf).await?;
-        trailing_bytes += nbytes;
-        if !buf.iter().all(|&x| x == 0) {
-            seen_nonzero_bytes = true;
-        }
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if seen_nonzero_bytes {
-        anyhow::bail!("unexpected non-zero bytes after the tar archive");
-    }
-    if trailing_bytes % 512 != 0 {
-        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
-    }
-    Ok(())
-}
-
 ///////////////////////////////////////////////////////////////////////////////

 ///
@@ -141,7 +83,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 ///
 pub async fn libpq_listener_main(
     tenant_manager: Arc<TenantManager>,
-    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     listener: TcpListener,
     auth_type: AuthType,
@@ -186,7 +127,6 @@ pub async fn libpq_listener_main(
                 false,
                 page_service_conn_main(
                     tenant_manager.clone(),
-                    broker_client.clone(),
                     local_auth,
                     socket,
                     auth_type,
@@ -209,20 +149,14 @@ pub async fn libpq_listener_main(
 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
     tenant_manager: Arc<TenantManager>,
-    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
     auth_type: AuthType,
     connection_ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    // Immediately increment the gauge, then create a job to decrement it on task exit.
-    // One of the pros of `defer!` is that this will *most probably*
-    // get called, even in presence of panics.
-    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
-    gauge.inc();
-    scopeguard::defer! {
-        gauge.dec();
-    }
+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["page_service"])
+        .guard();

     socket
         .set_nodelay(true)
@@ -267,12 +201,11 @@ async fn page_service_conn_main(
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
-    let mut conn_handler =
-        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
     let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

     match pgbackend
-        .run(&mut conn_handler, task_mgr::shutdown_watcher)
+        .run(&mut conn_handler, &task_mgr::shutdown_token())
         .await
     {
         Ok(()) => {
@@ -299,7 +232,6 @@ struct HandlerTimeline {
 }

 struct PageServerHandler {
-    broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     claims: Option<Claims>,

@@ -391,13 +323,11 @@ impl From<WaitLsnError> for QueryError {
 impl PageServerHandler {
     pub fn new(
         tenant_manager: Arc<TenantManager>,
-        broker_client: storage_broker::BrokerClientChannel,
         auth: Option<Arc<SwappableJwtAuth>>,
         connection_ctx: RequestContext,
     ) -> Self {
         PageServerHandler {
             tenant_manager,
-            broker_client,
             auth,
             claims: None,
             connection_ctx,
@@ -480,73 +410,6 @@ impl PageServerHandler {
         )
     }

-    fn copyin_stream<'a, IO>(
-        &'a self,
-        pgb: &'a mut PostgresBackend<IO>,
-        cancel: &'a CancellationToken,
-    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        async_stream::try_stream! {
-            loop {
-                let msg = tokio::select! {
-                    biased;
-
-                    _ = cancel.cancelled() => {
-                        // We were requested to shut down.
-                        let msg = "pageserver is shutting down";
-                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                        Err(QueryError::Shutdown)
-                    }
-
-                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
-                };
-
-                match msg {
-                    Ok(Some(message)) => {
-                        let copy_data_bytes = match message {
-                            FeMessage::CopyData(bytes) => bytes,
-                            FeMessage::CopyDone => { break },
-                            FeMessage::Sync => continue,
-                            FeMessage::Terminate => {
-                                let msg = "client terminated connection with Terminate message during COPY";
-                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                                break;
-                            }
-                            m => {
-                                let msg = format!("unexpected message {m:?}");
-                                // error can't happen here, ErrorResponse serialization should be always ok
-                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
-                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
-                                break;
-                            }
-                        };
-
-                        yield copy_data_bytes;
-                    }
-                    Ok(None) => {
-                        let msg = "client closed connection during COPY";
-                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                        // error can't happen here, ErrorResponse serialization should be always ok
-                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
-                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                    }
-                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
-                        Err(io_error)?;
-                    }
-                    Err(other) => {
-                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
-                    }
-                };
-            }
-        }
-    }
-
     #[instrument(skip_all)]
     async fn handle_pagerequests<IO>(
         &mut self,
@@ -565,18 +428,6 @@ impl PageServerHandler {
             .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
             .await?;

-        // Make request tracer if needed
-        let mut tracer = if tenant.get_trace_read_requests() {
-            let connection_id = ConnectionId::generate();
-            let path =
-                tenant
-                    .conf
-                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
-            Some(Tracer::new(path))
-        } else {
-            None
-        };
-
         // switch client to COPYBOTH
         pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
         self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -608,11 +459,6 @@ impl PageServerHandler {
             trace!("query: {copy_data_bytes:?}");
             fail::fail_point!("ps::handle-pagerequest-message");

-            // Trace request if needed
-            if let Some(t) = tracer.as_mut() {
-                t.trace(&copy_data_bytes)
-            }
-
             let neon_fe_msg =
                 PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

@@ -718,128 +564,6 @@ impl PageServerHandler {
         Ok(())
     }

-    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
-    async fn handle_import_basebackup<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        base_lsn: Lsn,
-        _end_lsn: Lsn,
-        pg_version: u32,
-        ctx: RequestContext,
-    ) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
-
-        // Create empty timeline
-        info!("creating new timeline");
-        let tenant = self
-            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
-            .await?;
-        let timeline = tenant
-            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
-            .await?;
-
-        // TODO mark timeline as not ready until it reaches end_lsn.
-        // We might have some wal to import as well, and we should prevent compute
-        // from connecting before that and writing conflicting wal.
-        //
-        // This is not relevant for pageserver->pageserver migrations, since there's
-        // no wal to import. But should be fixed if we want to import from postgres.
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import basebackup provided via CopyData
-        info!("importing basebackup");
-        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &tenant.cancel).await?;
-
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
-        timeline
-            .import_basebackup_from_tar(
-                tenant.clone(),
-                &mut copyin_reader,
-                base_lsn,
-                self.broker_client.clone(),
-                &ctx,
-            )
-            .await?;
-
-        // Read the end of the tar archive.
-        read_tar_eof(copyin_reader).await?;
-
-        // TODO check checksum
-        // Meanwhile you can verify client-side by taking fullbackup
-        // and checking that it matches in size with what was imported.
-        // It wouldn't work if base came from vanilla postgres though,
-        // since we discard some log files.
-
-        info!("done");
-        Ok(())
-    }
-
-    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
-    async fn handle_import_wal<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        start_lsn: Lsn,
-        end_lsn: Lsn,
-        ctx: RequestContext,
-    ) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-            .await?;
-        let last_record_lsn = timeline.get_last_record_lsn();
-        if last_record_lsn != start_lsn {
-            return Err(QueryError::Other(
-                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
-            );
-        }
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import wal provided via CopyData
-        info!("importing wal");
-        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
-        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
-        info!("wal import complete");
-
-        // Read the end of the tar archive.
-        read_tar_eof(copyin_reader).await?;
-
-        // TODO Does it make sense to overshoot?
-        if timeline.get_last_record_lsn() < end_lsn {
-            return Err(QueryError::Other(
-                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
-            );
-        }
-
-        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
-        // We only want to persist the data, and it doesn't matter if it's in the
-        // shape of deltas or images.
-        info!("flushing layers");
-        timeline.freeze_and_flush().await.map_err(|e| match e {
-            FlushLayerError::Cancelled => QueryError::Shutdown,
-            other => QueryError::Other(other.into()),
-        })?;
-
-        info!("done");
-        Ok(())
-    }
-
     /// Helper function to handle the LSN from client request.
     ///
     /// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1656,53 +1380,6 @@ where
                 metric_recording.observe(&res);
                 res?;
             }
-            // return pair of prev_lsn and last_lsn
-            else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
-                if params.len() != 2 {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "invalid param number for get_last_record_rlsn command"
-                    )));
-                }
-
-                let tenant_id = TenantId::from_str(params[0])
-                    .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-                let timeline_id = TimelineId::from_str(params[1])
-                    .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-                tracing::Span::current()
-                    .record("tenant_id", field::display(tenant_id))
-                    .record("timeline_id", field::display(timeline_id));
-
-                self.check_permission(Some(tenant_id))?;
-
-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::GetLastRecordRlsn)
-                    .inc();
-
-                async {
-                    let timeline = self
-                        .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-                        .await?;
-
-                    let end_of_timeline = timeline.get_last_record_rlsn();
-
-                    pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                        RowDescriptor::text_col(b"prev_lsn"),
-                        RowDescriptor::text_col(b"last_lsn"),
-                    ]))?
-                    .write_message_noflush(&BeMessage::DataRow(&[
-                        Some(end_of_timeline.prev.to_string().as_bytes()),
-                        Some(end_of_timeline.last.to_string().as_bytes()),
-                    ]))?
-                    .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    anyhow::Ok(())
-                }
-                .instrument(info_span!(
-                    "handle_get_last_record_lsn",
-                    shard_id = tracing::field::Empty
-                ))
-                .await?;
-            }
             // same as basebackup, but result includes relational data as well
             else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
                 if params.len() < 2 {
@@ -1757,109 +1434,6 @@ where
             )
             .await?;
             pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("import basebackup ") {
-            // Import the `base` section (everything but the wal) of a basebackup.
-            // Assumes the tenant already exists on this pageserver.
-            //
-            // Files are scheduled to be persisted to remote storage, and the
-            // caller should poll the http api to check when that is done.
-            //
-            // Example import command:
-            // 1. Get start/end LSN from backup_manifest file
-            // 2. Run:
-            // cat my_backup/base.tar | psql -h $PAGESERVER \
-            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
-            let params = &parts[2..];
-            if params.len() != 5 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for import basebackup command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-            let base_lsn = Lsn::from_str(params[2])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
-            let end_lsn = Lsn::from_str(params[3])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
-            let pg_version = u32::from_str(params[4])
-                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::ImportBasebackup)
-                .inc();
-
-            match self
-                .handle_import_basebackup(
-                    pgb,
-                    tenant_id,
-                    timeline_id,
-                    base_lsn,
-                    end_lsn,
-                    pg_version,
-                    ctx,
-                )
-                .await
-            {
-                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
-                Err(e) => {
-                    error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?
-                }
-            };
-        } else if query_string.starts_with("import wal ") {
-            // Import the `pg_wal` section of a basebackup.
-            //
-            // Files are scheduled to be persisted to remote storage, and the
-            // caller should poll the http api to check when that is done.
-            let params = &parts[2..];
-            if params.len() != 4 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for import wal command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-            let start_lsn = Lsn::from_str(params[2])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
-            let end_lsn = Lsn::from_str(params[3])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            COMPUTE_COMMANDS_COUNTERS
-                .for_command(ComputeCommandKind::ImportWal)
-                .inc();
-
-            match self
-                .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
-                .await
-            {
-                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
-                Err(e) => {
-                    error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?
-                }
-            };
         } else if query_string.to_ascii_lowercase().starts_with("set ") {
             // important because psycopg2 executes "SET datestyle TO 'ISO'"
             // on connect
@@ -522,7 +522,7 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> Result<Option<TimestampTz>, PageReconstructError> {
         let mut max: Option<TimestampTz> = None;
-        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
+        self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
             if let Some(max_prev) = max {
                 max = Some(max_prev.max(timestamp));
             } else {
@@ -854,13 +854,14 @@ impl Timeline {
         result.add_key(DBDIR_KEY);

         // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf)?;
-
-        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
-        dbs.sort_unstable();
-        for (spcnode, dbnode) in dbs {
+        let dbdir = self.list_dbdirs(lsn, ctx).await?;
+        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
+
+        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
+        for ((spcnode, dbnode), has_relmap_file) in dbs {
+            if has_relmap_file {
             result.add_key(relmap_file_key(spcnode, dbnode));
+            }
             result.add_key(rel_dir_to_key(spcnode, dbnode));

             let mut rels: Vec<RelTag> = self
@@ -919,6 +920,9 @@ impl Timeline {
             result.add_key(AUX_FILES_KEY);
         }

+        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
+        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
+        // and the keys will not be garbage-colllected.
         #[cfg(test)]
         {
             let guard = self.extra_test_dense_keyspace.load();
@@ -927,13 +931,48 @@ impl Timeline {
             }
         }

-        Ok((
-            result.to_keyspace(),
-            /* AUX sparse key space */
-            SparseKeySpace(KeySpace {
-                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
-            }),
-        ))
+        let dense_keyspace = result.to_keyspace();
+        let sparse_keyspace = SparseKeySpace(KeySpace {
+            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
+        });
+
+        if cfg!(debug_assertions) {
+            // Verify if the sparse keyspaces are ordered and non-overlapping.
+
+            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
+            // category of sparse keys are split into their own image/delta files. If there
+            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
+            // and we want the developer to keep the keyspaces separated.
+
+            let ranges = &sparse_keyspace.0.ranges;
+
+            // TODO: use a single overlaps_with across the codebase
+            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+            for i in 0..ranges.len() {
+                for j in 0..i {
+                    if overlaps_with(&ranges[i], &ranges[j]) {
+                        panic!(
+                            "overlapping sparse keyspace: {}..{} and {}..{}",
+                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
+                        );
+                    }
+                }
+            }
+            for i in 1..ranges.len() {
+                assert!(
+                    ranges[i - 1].end <= ranges[i].start,
+                    "unordered sparse keyspace: {}..{} and {}..{}",
+                    ranges[i - 1].start,
+                    ranges[i - 1].end,
+                    ranges[i].start,
+                    ranges[i].end
+                );
+            }
+        }
+
+        Ok((dense_keyspace, sparse_keyspace))
     }

     /// Get cached size of relation if it not updated after specified LSN
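As a quick illustration of the overlap and ordering check added in the hunk above, here is a minimal standalone sketch over std::ops::Range. It reuses the same predicate shape; the concrete range values are made up for the example and are not real pageserver keys.

    use std::ops::Range;

    fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
        // Half-open ranges overlap unless one ends at or before the other starts.
        !(a.end <= b.start || b.end <= a.start)
    }

    fn main() {
        // Illustrative half-open ranges standing in for sparse key ranges.
        let ranges: Vec<Range<u32>> = vec![0..10, 10..20, 15..30];
        for i in 0..ranges.len() {
            for j in 0..i {
                if overlaps_with(&ranges[i], &ranges[j]) {
                    println!("overlapping: {:?} and {:?}", ranges[j], ranges[i]);
                }
            }
        }
        for win in ranges.windows(2) {
            if win[0].end > win[1].start {
                println!("unordered: {:?} before {:?}", win[0], win[1]);
            }
        }
    }

Running this reports the third range as both overlapping and unordered, which is exactly the condition the debug assertion in the diff is meant to catch.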
@@ -39,6 +39,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::backoff;
+use utils::circuit_breaker::CircuitBreaker;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::failpoint_support;
@@ -73,9 +74,11 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
+use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::TENANT;
 use crate::metrics::{
-    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
+    remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
+    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
 };
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -166,6 +169,7 @@ pub struct TenantSharedResources {
     pub broker_client: storage_broker::BrokerClientChannel,
     pub remote_storage: GenericRemoteStorage,
     pub deletion_queue_client: DeletionQueueClient,
+    pub l0_flush_global_state: L0FlushGlobalState,
 }

 /// A [`Tenant`] is really an _attached_ tenant. The configuration
@@ -274,6 +278,10 @@ pub struct Tenant {

     eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

+    /// Track repeated failures to compact, so that we can back off.
+    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
+    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
+
     /// If the tenant is in Activating state, notify this to encourage it
     /// to proceed to Active as soon as possible, rather than waiting for lazy
     /// background warmup.
@@ -294,6 +302,8 @@ pub struct Tenant {

     /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
     ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
+
+    l0_flush_global_state: L0FlushGlobalState,
 }

 impl std::fmt::Debug for Tenant {
@@ -529,6 +539,15 @@ impl From<PageReconstructError> for GcError {
     }
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum LoadConfigError {
+    #[error("TOML deserialization error: '{0}'")]
+    DeserializeToml(#[from] toml_edit::de::Error),
+
+    #[error("Config not found at {0}")]
+    NotFound(Utf8PathBuf),
+}
+
 impl Tenant {
     /// Yet another helper for timeline initialization.
     ///
@@ -667,6 +686,7 @@ impl Tenant {
             broker_client,
             remote_storage,
             deletion_queue_client,
+            l0_flush_global_state,
         } = resources;

         let attach_mode = attached_conf.location.attach_mode;
@@ -681,6 +701,7 @@ impl Tenant {
             tenant_shard_id,
             remote_storage.clone(),
             deletion_queue_client,
+            l0_flush_global_state,
         ));

         // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -980,6 +1001,7 @@ impl Tenant {
                 TimelineResources {
                     remote_client,
                     timeline_get_throttle: self.timeline_get_throttle.clone(),
+                    l0_flush_global_state: self.l0_flush_global_state.clone(),
                 },
                 ctx,
             )
@@ -1349,7 +1371,7 @@ impl Tenant {
         initdb_lsn: Lsn,
         pg_version: u32,
         ctx: &RequestContext,
-        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
+        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
         image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
@@ -1625,13 +1647,31 @@ impl Tenant {
             timelines_to_compact
         };

+        // Before doing any I/O work, check our circuit breaker
+        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
+            info!("Skipping compaction due to previous failures");
+            return Ok(());
+        }
+
         for (timeline_id, timeline) in &timelines_to_compact {
             timeline
                 .compact(cancel, EnumSet::empty(), ctx)
                 .instrument(info_span!("compact_timeline", %timeline_id))
-                .await?;
+                .await
+                .map_err(|e| {
+                    self.compaction_circuit_breaker
+                        .lock()
+                        .unwrap()
+                        .fail(&CIRCUIT_BREAKERS_BROKEN, &e);
+                    e
+                })?;
         }

+        self.compaction_circuit_breaker
+            .lock()
+            .unwrap()
+            .success(&CIRCUIT_BREAKERS_UNBROKEN);
+
         Ok(())
     }

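The compaction loop above consults a circuit breaker before doing any I/O and trips it when compaction fails. As a rough, standalone illustration of that idea (a simplified local type, not the utils::circuit_breaker::CircuitBreaker API used in the diff, and without the metrics arguments), a breaker counts consecutive failures and, past a threshold, stays open for a long backoff period:

    use std::time::{Duration, Instant};

    /// Simplified stand-in for a circuit breaker: trips after `threshold`
    /// consecutive failures and stays broken for `backoff`.
    struct Breaker {
        threshold: u32,
        backoff: Duration,
        failures: u32,
        broken_until: Option<Instant>,
    }

    impl Breaker {
        fn new(threshold: u32, backoff: Duration) -> Self {
            Self { threshold, backoff, failures: 0, broken_until: None }
        }
        fn is_broken(&self) -> bool {
            self.broken_until.map_or(false, |t| Instant::now() < t)
        }
        fn fail(&mut self) {
            self.failures += 1;
            if self.failures >= self.threshold {
                self.broken_until = Some(Instant::now() + self.backoff);
            }
        }
        fn success(&mut self) {
            self.failures = 0;
            self.broken_until = None;
        }
    }

    fn main() {
        // Mirrors the parameters used elsewhere in this diff: 5 failures, 24h backoff.
        let mut breaker = Breaker::new(5, Duration::from_secs(3600 * 24));
        for _ in 0..5 {
            breaker.fail();
        }
        assert!(breaker.is_broken()); // compaction would now be skipped
        breaker.success();
        assert!(!breaker.is_broken());
    }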
@@ -1800,9 +1840,15 @@ impl Tenant {
         // If we're still attaching, fire the cancellation token early to drop out: this
         // will prevent us flushing, but ensures timely shutdown if some I/O during attach
         // is very slow.
-        if matches!(self.current_state(), TenantState::Attaching) {
+        let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
             self.cancel.cancel();
-        }
+
+            // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
+            // are children of ours, so their flush loops will have shut down already
+            timeline::ShutdownMode::Hard
+        } else {
+            shutdown_mode
+        };

         match self.set_stopping(shutdown_progress, false, false).await {
             Ok(()) => {}
@@ -2319,13 +2365,6 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
     }

-    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
-        tenant_conf
-            .trace_read_requests
-            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
-    }
-
     pub fn get_min_resident_size_override(&self) -> Option<u64> {
         let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
@@ -2469,6 +2508,7 @@ impl Tenant {
         tenant_shard_id: TenantShardId,
         remote_storage: GenericRemoteStorage,
         deletion_queue_client: DeletionQueueClient,
+        l0_flush_global_state: L0FlushGlobalState,
     ) -> Tenant {
         debug_assert!(
             !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
@@ -2547,6 +2587,14 @@ impl Tenant {
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
             eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
+                format!("compaction-{tenant_shard_id}"),
+                5,
+                // Compaction can be a very expensive operation, and might leak disk space. It also ought
+                // to be infallible, as long as remote storage is available. So if it repeatedly fails,
+                // use an extremely long backoff.
+                Some(Duration::from_secs(3600 * 24)),
+            )),
             activate_now_sem: tokio::sync::Semaphore::new(0),
             cancel: CancellationToken::default(),
             gate: Gate::default(),
@@ -2556,6 +2604,7 @@ impl Tenant {
             )),
             tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
             ongoing_timeline_detach: std::sync::Mutex::default(),
+            l0_flush_global_state,
         }
     }

@@ -2563,36 +2612,35 @@ impl Tenant {
     pub(super) fn load_tenant_config(
         conf: &'static PageServerConf,
         tenant_shard_id: &TenantShardId,
-    ) -> anyhow::Result<LocationConf> {
+    ) -> Result<LocationConf, LoadConfigError> {
         let config_path = conf.tenant_location_config_path(tenant_shard_id);

-        if config_path.exists() {
-            // New-style config takes precedence
-            let deserialized = Self::read_config(&config_path)?;
-            Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
-        } else {
-            // The config should almost always exist for a tenant directory:
-            // - When attaching a tenant, the config is the first thing we write
-            // - When detaching a tenant, we atomically move the directory to a tmp location
-            //   before deleting contents.
-            //
-            // The very rare edge case that can result in a missing config is if we crash during attach
-            // between creating directory and writing config. Callers should handle that as if the
-            // directory didn't exist.
-            anyhow::bail!("tenant config not found in {}", config_path);
-        }
-    }
-
-    fn read_config(path: &Utf8Path) -> anyhow::Result<toml_edit::Document> {
-        info!("loading tenant configuration from {path}");
+        info!("loading tenant configuration from {config_path}");

         // load and parse file
-        let config = fs::read_to_string(path)
-            .with_context(|| format!("Failed to load config from path '{path}'"))?;
-
-        config
-            .parse::<toml_edit::Document>()
-            .with_context(|| format!("Failed to parse config from file '{path}' as toml file"))
+        let config = fs::read_to_string(&config_path).map_err(|e| {
+            match e.kind() {
+                std::io::ErrorKind::NotFound => {
+                    // The config should almost always exist for a tenant directory:
+                    // - When attaching a tenant, the config is the first thing we write
+                    // - When detaching a tenant, we atomically move the directory to a tmp location
+                    //   before deleting contents.
+                    //
+                    // The very rare edge case that can result in a missing config is if we crash during attach
+                    // between creating directory and writing config. Callers should handle that as if the
+                    // directory didn't exist.
+                    LoadConfigError::NotFound(config_path)
+                }
+                _ => {
+                    // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
+                    // that we cannot cleanly recover
+                    crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
+                }
+            }
+        })?;
+
+        Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
     }

     #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
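The rewritten load_tenant_config above splits I/O errors into an expected "not found" case and everything else, which is treated as fatal. Here is a standalone sketch of the same pattern using only the standard library; ConfigError and load_config are hypothetical names for the example, not the crate's LoadConfigError or on_fatal_io_error.

    use std::fs;
    use std::io::ErrorKind;
    use std::path::PathBuf;

    #[derive(Debug)]
    enum ConfigError {
        NotFound(PathBuf),
        Parse(String),
    }

    fn load_config(path: &PathBuf) -> Result<String, ConfigError> {
        let contents = fs::read_to_string(path).map_err(|e| match e.kind() {
            // A missing file is an expected condition that callers can handle.
            ErrorKind::NotFound => ConfigError::NotFound(path.clone()),
            // Anything else points at broken local storage; abort loudly.
            _ => panic!("fatal I/O error reading config: {e}"),
        })?;
        // Deserialization failures get their own variant (real parsing elided).
        if contents.trim().is_empty() {
            return Err(ConfigError::Parse("empty config".to_string()));
        }
        Ok(contents)
    }

    fn main() {
        match load_config(&PathBuf::from("/nonexistent/config-v1")) {
            Err(ConfigError::NotFound(p)) => println!("no config at {}", p.display()),
            other => println!("{other:?}"),
        }
    }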
@@ -2600,7 +2648,7 @@ impl Tenant {
         conf: &'static PageServerConf,
         tenant_shard_id: &TenantShardId,
         location_conf: &LocationConf,
-    ) -> anyhow::Result<()> {
+    ) -> std::io::Result<()> {
         let config_path = conf.tenant_location_config_path(tenant_shard_id);

         Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
@@ -2611,7 +2659,7 @@ impl Tenant {
         tenant_shard_id: &TenantShardId,
         config_path: &Utf8Path,
         location_conf: &LocationConf,
-    ) -> anyhow::Result<()> {
+    ) -> std::io::Result<()> {
         debug!("persisting tenantconf to {config_path}");

         let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -2620,22 +2668,20 @@ impl Tenant {
         .to_string();

         fail::fail_point!("tenant-config-before-write", |_| {
-            anyhow::bail!("tenant-config-before-write");
+            Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "tenant-config-before-write",
+            ))
         });

         // Convert the config to a toml file.
-        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
+        conf_content +=
+            &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");

         let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);

-        let tenant_shard_id = *tenant_shard_id;
-        let config_path = config_path.to_owned();
         let conf_content = conf_content.into_bytes();
-        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
-            .await
-            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
-
-        Ok(())
+        VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
     }

     //
@@ -2853,6 +2899,7 @@ impl Tenant {
             {
                 let mut target = timeline.gc_info.write().unwrap();

+                // Cull any expired leases
                 let now = SystemTime::now();
                 target.leases.retain(|_, lease| !lease.is_expired(&now));

@@ -2861,6 +2908,31 @@ impl Tenant {
                     .valid_lsn_lease_count_gauge
                     .set(target.leases.len() as u64);

+                // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
+                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
+                    if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
+                        target.within_ancestor_pitr =
+                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
+                    }
+                }
+
+                // Update metrics that depend on GC state
+                timeline
+                    .metrics
+                    .archival_size
+                    .set(if target.within_ancestor_pitr {
+                        timeline.metrics.current_logical_size_gauge.get()
+                    } else {
+                        0
+                    });
+                timeline.metrics.pitr_history_size.set(
+                    timeline
+                        .get_last_record_lsn()
+                        .checked_sub(target.cutoffs.pitr)
+                        .unwrap_or(Lsn(0))
+                        .0,
+                );
+
                 match gc_cutoffs.remove(&timeline.timeline_id) {
                     Some(cutoffs) => {
                         target.retain_lsns = branchpoints;
@@ -2912,7 +2984,7 @@ impl Tenant {
         dst_id: TimelineId,
         ancestor_lsn: Option<Lsn>,
         ctx: &RequestContext,
-        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
+        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
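The pitr_history_size metric above is the distance between the last record LSN and the PITR cutoff, clamped to zero when the cutoff is ahead. A tiny standalone sketch of the same arithmetic, using plain u64 values in place of the Lsn newtype:

    fn main() {
        // Stand-ins for Lsn values; the real code wraps u64 in an Lsn newtype.
        let last_record_lsn: u64 = 0x5000;
        let pitr_cutoff: u64 = 0x3000;

        // checked_sub falls back to 0 when the cutoff is ahead of the last record LSN.
        let pitr_history_size = last_record_lsn.checked_sub(pitr_cutoff).unwrap_or(0);
        assert_eq!(pitr_history_size, 0x2000);

        // within_ancestor_pitr is a plain comparison of the branch point against
        // the ancestor's PITR cutoff.
        let within_ancestor_pitr = |ancestor_lsn: u64, ancestor_pitr: u64| ancestor_lsn >= ancestor_pitr;
        assert!(within_ancestor_pitr(0x4000, 0x3000));
    }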
@@ -3296,6 +3368,7 @@ impl Tenant {
             TimelineResources {
                 remote_client,
                 timeline_get_throttle: self.timeline_get_throttle.clone(),
+                l0_flush_global_state: self.l0_flush_global_state.clone(),
             }
         }

@@ -3632,6 +3705,7 @@ pub(crate) mod harness {
     use utils::logging;

     use crate::deletion_queue::mock::MockDeletionQueue;
+    use crate::l0_flush::L0FlushConfig;
     use crate::walredo::apply_neon;
     use crate::{repository::Key, walrecord::NeonWalRecord};

@@ -3669,7 +3743,6 @@ pub(crate) mod harness {
             walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
             lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
             max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
-            trace_read_requests: Some(tenant_conf.trace_read_requests),
             eviction_policy: Some(tenant_conf.eviction_policy),
             min_resident_size_override: tenant_conf.min_resident_size_override,
             evictions_low_residence_duration_metric_threshold: Some(
@@ -3821,6 +3894,8 @@ pub(crate) mod harness {
             self.tenant_shard_id,
             self.remote_storage.clone(),
             self.deletion_queue.new_client(),
+            // TODO: ideally we should run all unit tests with both configs
+            L0FlushGlobalState::new(L0FlushConfig::default()),
         ));

         let preload = tenant
@@ -3908,7 +3983,7 @@ mod tests {
     use storage_layer::PersistentLayerKey;
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
-    use timeline::GcInfo;
+    use timeline::{DeltaLayerTestDesc, GcInfo};
     use utils::bin_ser::BeSer;
     use utils::id::TenantId;

@@ -6204,27 +6279,6 @@ mod tests {
     .await
     .unwrap();

-    async fn get_vectored_impl_wrapper(
-        tline: &Arc<Timeline>,
-        key: Key,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let mut reconstruct_state = ValuesReconstructState::new();
-        let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
-            .await?;
-        Ok(res.pop_last().map(|(k, v)| {
-            assert_eq!(k, key);
-            v.unwrap()
-        }))
-    }
-
     let lsn = Lsn(0x30);

     // test vectored get on parent timeline
@@ -6300,27 +6354,6 @@ mod tests {
     .await
     .unwrap();

-    async fn get_vectored_impl_wrapper(
-        tline: &Arc<Timeline>,
-        key: Key,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let mut reconstruct_state = ValuesReconstructState::new();
-        let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
-            .await?;
-        Ok(res.pop_last().map(|(k, v)| {
-            assert_eq!(k, key);
-            v.unwrap()
-        }))
-    }
-
     let lsn = Lsn(0x30);

     // test vectored get on parent timeline
@@ -6396,9 +6429,18 @@ mod tests {
         &ctx,
         // delta layers
         vec![
-            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x10)..Lsn(0x20),
+                vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
         ],
         // image layers
         vec![
@@ -6464,17 +6506,29 @@ mod tests {
         &ctx,
         // delta layers
         vec![
-            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![
-                (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
-                (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
-            ],
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x10)..Lsn(0x20),
+                vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x30)..Lsn(0x40),
+                vec![
+                    (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
+                    (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
+                ],
+            ),
         ],
         // image layers
         vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
-        Lsn(0x30),
+        Lsn(0x40),
     )
     .await
     .unwrap();
@@ -6497,7 +6551,7 @@ mod tests {

     // Image layers are created at last_record_lsn
     let images = tline
-        .inspect_image_layers(Lsn(0x30), &ctx)
+        .inspect_image_layers(Lsn(0x40), &ctx)
         .await
         .unwrap()
         .into_iter()
@@ -6523,9 +6577,18 @@ mod tests {
         &ctx,
         // delta layers
         vec![
-            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x10)..Lsn(0x20),
+                vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
+            DeltaLayerTestDesc::new_with_inferred_key_range(
+                Lsn(0x20)..Lsn(0x30),
+                vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+            ),
         ],
         // image layers
         vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
@@ -6573,15 +6636,21 @@ mod tests {
         key
     }

-    // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
+    // We create
+    // - one bottom-most image layer,
+    // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
+    // - a delta layer D2 crossing the GC horizon with data only below the horizon,
+    // - a delta layer D3 above the horizon.
     //
-    // | D1 | | D3 |
+    //                             | D3 |
+    // | D1 |
     // -| |-- gc horizon -----------------
     // | | | D2 |
     // --------- img layer ------------------
     //
     // What we should expact from this compaction is:
-    // | Part of D1 | | D3 |
+    //                             | D3 |
+    // | Part of D1 |
     // --------- img layer with D1+D2 at GC horizon------------------

     // img layer at 0x10
@@ -6621,13 +6690,13 @@ mod tests {
     let delta3 = vec![
         (
             get_key(8),
-            Lsn(0x40),
-            Value::Image(Bytes::from("value 8@0x40")),
+            Lsn(0x48),
+            Value::Image(Bytes::from("value 8@0x48")),
         ),
         (
             get_key(9),
-            Lsn(0x40),
-            Value::Image(Bytes::from("value 9@0x40")),
+            Lsn(0x48),
+            Value::Image(Bytes::from("value 9@0x48")),
         ),
     ];

@@ -6637,7 +6706,11 @@ mod tests {
         Lsn(0x10),
         DEFAULT_PG_VERSION,
         &ctx,
-        vec![delta1, delta2, delta3], // delta layers
+        vec![
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
+        ], // delta layers
         vec![(Lsn(0x10), img_layer)], // image layers
         Lsn(0x50),
     )
@@ -6658,8 +6731,8 @@ mod tests {
         Bytes::from_static(b"value 5@0x20"),
         Bytes::from_static(b"value 6@0x20"),
         Bytes::from_static(b"value 7@0x10"),
-        Bytes::from_static(b"value 8@0x40"),
-        Bytes::from_static(b"value 9@0x40"),
+        Bytes::from_static(b"value 8@0x48"),
+        Bytes::from_static(b"value 9@0x48"),
     ];

     for (idx, expected) in expected_result.iter().enumerate() {
@@ -6747,10 +6820,10 @@ mod tests {
                 lsn_range: Lsn(0x30)..Lsn(0x41),
                 is_delta: true
             },
-            // The delta layer we created and should not be picked for the compaction
+            // The delta3 layer that should not be picked for the compaction
             PersistentLayerKey {
                 key_range: get_key(8)..get_key(10),
-                lsn_range: Lsn(0x40)..Lsn(0x41),
+                lsn_range: Lsn(0x48)..Lsn(0x50),
                 is_delta: true
             }
         ]
@@ -6814,7 +6887,10 @@ mod tests {
         Lsn(0x10),
         DEFAULT_PG_VERSION,
         &ctx,
-        vec![delta1], // delta layers
+        vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x40),
+            delta1,
+        )], // delta layers
         vec![(Lsn(0x10), image1)], // image layers
         Lsn(0x50),
     )
@@ -6938,15 +7014,21 @@ mod tests {
         key
     }

-    // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
+    // We create
+    // - one bottom-most image layer,
+    // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
+    // - a delta layer D2 crossing the GC horizon with data only below the horizon,
+    // - a delta layer D3 above the horizon.
     //
-    // | D1 | | D3 |
+    //                             | D3 |
+    // | D1 |
     // -| |-- gc horizon -----------------
     // | | | D2 |
     // --------- img layer ------------------
     //
     // What we should expact from this compaction is:
-    // | Part of D1 | | D3 |
+    //                             | D3 |
+    // | Part of D1 |
     // --------- img layer with D1+D2 at GC horizon------------------

     // img layer at 0x10
@@ -6996,13 +7078,13 @@ mod tests {
     let delta3 = vec![
         (
             get_key(8),
-            Lsn(0x40),
-            Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
+            Lsn(0x48),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
         ),
         (
             get_key(9),
-            Lsn(0x40),
-            Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
+            Lsn(0x48),
+            Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
         ),
     ];

@@ -7012,7 +7094,11 @@ mod tests {
         Lsn(0x10),
         DEFAULT_PG_VERSION,
         &ctx,
-        vec![delta1, delta2, delta3], // delta layers
+        vec![
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
+            DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
+        ], // delta layers
         vec![(Lsn(0x10), img_layer)], // image layers
         Lsn(0x50),
     )
@@ -7027,6 +7113,7 @@ mod tests {
             horizon: Lsn(0x30),
         },
         leases: Default::default(),
+        within_ancestor_pitr: false,
     };
     }

@@ -7039,8 +7126,8 @@ mod tests {
         Bytes::from_static(b"value 5@0x10@0x20"),
         Bytes::from_static(b"value 6@0x10@0x20"),
         Bytes::from_static(b"value 7@0x10"),
-        Bytes::from_static(b"value 8@0x10@0x40"),
-        Bytes::from_static(b"value 9@0x10@0x40"),
+        Bytes::from_static(b"value 8@0x10@0x48"),
+        Bytes::from_static(b"value 9@0x10@0x48"),
     ];

     let expected_result_at_gc_horizon = [
@@ -6,13 +6,20 @@
 //! is written as a one byte. If it's larger than that, the length
 //! is written as a four-byte integer, in big-endian, with the high
 //! bit set. This way, we can detect whether it's 1- or 4-byte header
-//! by peeking at the first byte.
+//! by peeking at the first byte. For blobs larger than 128 bits,
+//! we also specify three reserved bits, only one of the three bit
+//! patterns is currently in use (0b011) and signifies compression
+//! with zstd.
 //!
 //! len <  128: 0XXXXXXX
-//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
+//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
+use async_compression::Level;
 use bytes::{BufMut, BytesMut};
+use pageserver_api::models::ImageCompressionAlgorithm;
+use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tracing::warn;

 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
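The module comment above describes the blob header format: short blobs get a single length byte with the high bit clear, longer blobs get a 4-byte big-endian length whose top bits carry the "large blob" flag plus reserved compression bits. A standalone sketch of the write-side encoding; the constant names and values mirror the ones introduced later in this diff, but the function itself is illustrative only.

    const BYTE_UNCOMPRESSED: u8 = 0x80;
    const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
    const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;

    fn encode_len_header(len: usize, compressed: bool) -> Vec<u8> {
        if len < 128 {
            // Short blob: single byte, high bit clear.
            vec![len as u8]
        } else {
            assert!(len <= MAX_SUPPORTED_LEN, "blob too large");
            let mut buf = (len as u32).to_be_bytes();
            // The top nibble must be free; set the high bit plus the compression bits.
            assert_eq!(buf[0] & 0xf0, 0);
            buf[0] |= if compressed { BYTE_ZSTD } else { BYTE_UNCOMPRESSED };
            buf.to_vec()
        }
    }

    fn main() {
        assert_eq!(encode_len_header(5, false), vec![5]);
        assert_eq!(encode_len_header(300, false), vec![0x80, 0x00, 0x01, 0x2c]);
        assert_eq!(encode_len_header(300, true), vec![0x90, 0x00, 0x01, 0x2c]);
    }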
@@ -66,12 +73,37 @@ impl<'a> BlockCursor<'a> {
                 len_buf.copy_from_slice(&buf[off..off + 4]);
                 off += 4;
             }
-            len_buf[0] &= 0x7f;
+            let bit_mask = if self.read_compressed {
+                !LEN_COMPRESSION_BIT_MASK
+            } else {
+                0x7f
+            };
+            len_buf[0] &= bit_mask;
             u32::from_be_bytes(len_buf) as usize
         };
+        let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;

-        dstbuf.clear();
-        dstbuf.reserve(len);
+        let mut tmp_buf = Vec::new();
+        let buf_to_write;
+        let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed {
+            if compression_bits > BYTE_UNCOMPRESSED {
+                warn!("reading key above future limit ({len} bytes)");
+            }
+            buf_to_write = dstbuf;
+            None
+        } else if compression_bits == BYTE_ZSTD {
+            buf_to_write = &mut tmp_buf;
+            Some(dstbuf)
+        } else {
+            let error = std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                format!("invalid compression byte {compression_bits:x}"),
+            );
+            return Err(error);
+        };
+
+        buf_to_write.clear();
+        buf_to_write.reserve(len);

         // Read the payload
         let mut remain = len;
@@ -85,14 +117,35 @@ impl<'a> BlockCursor<'a> {
                 page_remain = PAGE_SZ;
             }
             let this_blk_len = min(remain, page_remain);
-            dstbuf.extend_from_slice(&buf[off..off + this_blk_len]);
+            buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]);
             remain -= this_blk_len;
             off += this_blk_len;
         }
+
+        if let Some(dstbuf) = compression {
+            if compression_bits == BYTE_ZSTD {
+                let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf);
+                decoder.write_all(buf_to_write).await?;
+                decoder.flush().await?;
+            } else {
+                unreachable!("already checked above")
+            }
+        }
+
         Ok(())
     }
 }

+/// Reserved bits for length and compression
+pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
+
+/// The maximum size of blobs we support. The highest few bits
+/// are reserved for compression and other further uses.
+const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
+
+pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
+pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
+
 /// A wrapper of `VirtualFile` that allows users to write blobs.
 ///
 /// If a `BlobWriter` is dropped, the internal buffer will be
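Mirroring the read path above, the decoder peeks at the first byte, chooses a 1- or 4-byte header, masks off the reserved bits before interpreting the length, and dispatches on the compression bits. A self-contained sketch of just that header decoding; the constants are copied from the hunk above, the function and its return shape are invented for the example.

    const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
    const BYTE_UNCOMPRESSED: u8 = 0x80;
    const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

    /// Returns (header_len, payload_len, is_zstd) for a buffer starting at a blob header.
    fn decode_len_header(buf: &[u8]) -> (usize, usize, bool) {
        let first = buf[0];
        if first < 0x80 {
            // 1-byte header: the byte itself is the length.
            return (1, first as usize, false);
        }
        let compression_bits = first & LEN_COMPRESSION_BIT_MASK;
        let mut len_buf = [buf[0], buf[1], buf[2], buf[3]];
        // Clear all reserved bits before interpreting the big-endian length.
        len_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
        let len = u32::from_be_bytes(len_buf) as usize;
        (4, len, compression_bits == BYTE_ZSTD)
    }

    fn main() {
        assert_eq!(decode_len_header(&[5]), (1, 5, false));
        assert_eq!(decode_len_header(&[0x80, 0x00, 0x01, 0x2c]), (4, 300, false));
        assert_eq!(decode_len_header(&[0x90, 0x00, 0x01, 0x2c]), (4, 300, true));
    }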
@@ -219,6 +272,18 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
         &mut self,
         srcbuf: B,
         ctx: &RequestContext,
+    ) -> (B::Buf, Result<u64, Error>) {
+        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
+            .await
+    }
+
+    /// Write a blob of data. Returns the offset that it was written to,
+    /// which can be used to retrieve the data later.
+    pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
+        &mut self,
+        srcbuf: B,
+        ctx: &RequestContext,
+        algorithm: ImageCompressionAlgorithm,
     ) -> (B::Buf, Result<u64, Error>) {
         let offset = self.offset;

@@ -226,29 +291,60 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {

         let mut io_buf = self.io_buf.take().expect("we always put it back below");
         io_buf.clear();
-        let (io_buf, hdr_res) = async {
+        let mut compressed_buf = None;
+        let ((io_buf, hdr_res), srcbuf) = async {
             if len < 128 {
                 // Short blob. Write a 1-byte length header
                 io_buf.put_u8(len as u8);
-                self.write_all(io_buf, ctx).await
+                (
+                    self.write_all(io_buf, ctx).await,
+                    srcbuf.slice_full().into_inner(),
+                )
             } else {
                 // Write a 4-byte length header
-                if len > 0x7fff_ffff {
+                if len > MAX_SUPPORTED_LEN {
                     return (
-                        io_buf,
-                        Err(Error::new(
-                            ErrorKind::Other,
-                            format!("blob too large ({len} bytes)"),
-                        )),
+                        (
+                            io_buf,
+                            Err(Error::new(
+                                ErrorKind::Other,
+                                format!("blob too large ({len} bytes)"),
+                            )),
+                        ),
+                        srcbuf.slice_full().into_inner(),
                     );
                 }
-                if len > 0x0fff_ffff {
-                    tracing::warn!("writing blob above future limit ({len} bytes)");
-                }
-                let mut len_buf = (len as u32).to_be_bytes();
-                len_buf[0] |= 0x80;
+                let (high_bit_mask, len_written, srcbuf) = match algorithm {
+                    ImageCompressionAlgorithm::Zstd { level } => {
+                        let mut encoder = if let Some(level) = level {
+                            async_compression::tokio::write::ZstdEncoder::with_quality(
+                                Vec::new(),
+                                Level::Precise(level.into()),
+                            )
+                        } else {
+                            async_compression::tokio::write::ZstdEncoder::new(Vec::new())
+                        };
+                        let slice = srcbuf.slice_full();
+                        encoder.write_all(&slice[..]).await.unwrap();
+                        encoder.shutdown().await.unwrap();
+                        let compressed = encoder.into_inner();
+                        if compressed.len() < len {
+                            let compressed_len = compressed.len();
+                            compressed_buf = Some(compressed);
+                            (BYTE_ZSTD, compressed_len, slice.into_inner())
+                        } else {
+                            (BYTE_UNCOMPRESSED, len, slice.into_inner())
+                        }
+                    }
+                    ImageCompressionAlgorithm::Disabled => {
+                        (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
+                    }
+                };
+                let mut len_buf = (len_written as u32).to_be_bytes();
+                assert_eq!(len_buf[0] & 0xf0, 0);
+                len_buf[0] |= high_bit_mask;
                 io_buf.extend_from_slice(&len_buf[..]);
-                self.write_all(io_buf, ctx).await
+                (self.write_all(io_buf, ctx).await, srcbuf)
             }
         }
         .await;
@@ -257,7 +353,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
             Ok(_) => (),
             Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
         }
-        let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
+        let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
+            let (_buf, res) = self.write_all(compressed_buf, ctx).await;
+            (Slice::into_inner(srcbuf.slice(..)), res)
+        } else {
+            self.write_all(srcbuf, ctx).await
+        };
         (srcbuf, res.map(|_| offset))
     }
 }
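The write path above only stores the zstd output when it is actually smaller than the original payload; otherwise it falls back to the uncompressed bytes and the plain length header. Here is a minimal standalone sketch of that "compress only if it helps" decision, assuming the async-compression crate (with its tokio and zstd features) and a tokio runtime, the same dependencies the diff itself uses; maybe_compress is a name invented for the example.

    use async_compression::tokio::write::ZstdEncoder;
    use tokio::io::AsyncWriteExt;

    /// Compress `data` with zstd, but keep the original if compression does not shrink it.
    /// Returns the bytes to store plus a flag saying which variant won.
    async fn maybe_compress(data: &[u8]) -> std::io::Result<(Vec<u8>, bool)> {
        let mut encoder = ZstdEncoder::new(Vec::new());
        encoder.write_all(data).await?;
        encoder.shutdown().await?;
        let compressed = encoder.into_inner();
        if compressed.len() < data.len() {
            Ok((compressed, true))
        } else {
            Ok((data.to_vec(), false))
        }
    }

    #[tokio::main]
    async fn main() -> std::io::Result<()> {
        // Highly repetitive data compresses well; incompressible data falls back.
        let repetitive = vec![0xf3u8; 8192];
        let (stored, compressed) = maybe_compress(&repetitive).await?;
        println!("stored {} bytes, compressed = {compressed}", stored.len());
        Ok(())
    }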
@@ -289,37 +390,65 @@ impl BlobWriter<false> {
 }

 #[cfg(test)]
-mod tests {
+pub(crate) mod tests {
     use super::*;
     use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
+    use camino::Utf8PathBuf;
+    use camino_tempfile::Utf8TempDir;
     use rand::{Rng, SeedableRng};

     async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
+        round_trip_test_compressed::<BUFFERED>(blobs, false).await
+    }
+
+    pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
+        blobs: &[Vec<u8>],
+        compression: bool,
+        ctx: &RequestContext,
+    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
         let temp_dir = camino_tempfile::tempdir()?;
         let pathbuf = temp_dir.path().join("file");
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

         // Write part (in block to drop the file)
         let mut offsets = Vec::new();
         {
-            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
+            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
             let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
             for blob in blobs.iter() {
-                let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
+                let (_, res) = if compression {
+                    wtr.write_blob_maybe_compressed(
+                        blob.clone(),
+                        ctx,
+                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
+                    )
+                    .await
+                } else {
+                    wtr.write_blob(blob.clone(), ctx).await
+                };
                 let offs = res?;
                 offsets.push(offs);
             }
             // Write out one page worth of zeros so that we can
             // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
             let offs = res?;
             println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(&ctx).await?;
+            wtr.flush_buffer(ctx).await?;
         }
+        Ok((temp_dir, pathbuf, offsets))
+    }

-        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
+    async fn round_trip_test_compressed<const BUFFERED: bool>(
+        blobs: &[Vec<u8>],
+        compression: bool,
+    ) -> Result<(), Error> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let (_temp_dir, pathbuf, offsets) =
+            write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
+
+        let file = VirtualFile::open(pathbuf, &ctx).await?;
         let rdr = BlockReaderRef::VirtualFile(&file);
-        let rdr = BlockCursor::new(rdr);
+        let rdr = BlockCursor::new_with_compression(rdr, compression);
         for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
             let blob_read = rdr.read_blob(*offset, &ctx).await?;
             assert_eq!(
@@ -330,7 +459,7 @@ mod tests {
         Ok(())
     }

-    fn random_array(len: usize) -> Vec<u8> {
+    pub(crate) fn random_array(len: usize) -> Vec<u8> {
         let mut rng = rand::thread_rng();
         (0..len).map(|_| rng.gen()).collect::<_>()
     }
@@ -353,6 +482,8 @@ mod tests {
         ];
         round_trip_test::<false>(blobs).await?;
         round_trip_test::<true>(blobs).await?;
+        round_trip_test_compressed::<false>(blobs, true).await?;
+        round_trip_test_compressed::<true>(blobs, true).await?;
         Ok(())
     }

@@ -361,10 +492,15 @@ mod tests {
         let blobs = &[
             b"test".to_vec(),
             random_array(10 * PAGE_SZ),
+            b"hello".to_vec(),
+            random_array(66 * PAGE_SZ),
+            vec![0xf3; 24 * PAGE_SZ],
             b"foobar".to_vec(),
         ];
         round_trip_test::<false>(blobs).await?;
         round_trip_test::<true>(blobs).await?;
+        round_trip_test_compressed::<false>(blobs, true).await?;
+        round_trip_test_compressed::<true>(blobs, true).await?;
         Ok(())
     }

@@ -37,6 +37,7 @@ where
|
|||||||
pub enum BlockLease<'a> {
|
pub enum BlockLease<'a> {
|
||||||
PageReadGuard(PageReadGuard<'static>),
|
PageReadGuard(PageReadGuard<'static>),
|
 EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
+Slice(&'a [u8; PAGE_SZ]),
 #[cfg(test)]
 Arc(std::sync::Arc<[u8; PAGE_SZ]>),
 #[cfg(test)]
@@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> {
 match self {
 BlockLease::PageReadGuard(v) => v.deref(),
 BlockLease::EphemeralFileMutableTail(v) => v,
+BlockLease::Slice(v) => v,
 #[cfg(test)]
 BlockLease::Arc(v) => v.deref(),
 #[cfg(test)]
@@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> {
 FileBlockReader(&'a FileBlockReader<'a>),
 EphemeralFile(&'a EphemeralFile),
 Adapter(Adapter<&'a DeltaLayerInner>),
+Slice(&'a [u8]),
 #[cfg(test)]
 TestDisk(&'a super::disk_btree::tests::TestDisk),
 #[cfg(test)]
@@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> {
 FileBlockReader(r) => r.read_blk(blknum, ctx).await,
 EphemeralFile(r) => r.read_blk(blknum, ctx).await,
 Adapter(r) => r.read_blk(blknum, ctx).await,
+Slice(s) => Self::read_blk_slice(s, blknum),
 #[cfg(test)]
 TestDisk(r) => r.read_blk(blknum),
 #[cfg(test)]
@@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> {
 }
 }
 
+impl<'a> BlockReaderRef<'a> {
+fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
+let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
+let end = start.checked_add(PAGE_SZ).unwrap();
+if end > slice.len() {
+return Err(std::io::Error::new(
+std::io::ErrorKind::UnexpectedEof,
+format!("slice too short, len={} end={}", slice.len(), end),
+));
+}
+let slice = &slice[start..end];
+let page_sized: &[u8; PAGE_SZ] = slice
+.try_into()
+.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
+Ok(BlockLease::Slice(page_sized))
+}
+}
+
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -127,16 +149,24 @@ impl<'a> BlockReaderRef<'a> {
 /// ```
 ///
 pub struct BlockCursor<'a> {
+pub(super) read_compressed: bool,
 reader: BlockReaderRef<'a>,
 }
 
 impl<'a> BlockCursor<'a> {
 pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
-BlockCursor { reader }
+Self::new_with_compression(reader, false)
+}
+pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self {
+BlockCursor {
+read_compressed,
+reader,
+}
 }
 // Needed by cli
 pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
 BlockCursor {
+read_compressed: false,
 reader: BlockReaderRef::FileBlockReader(reader),
 }
 }
@@ -166,11 +196,17 @@ pub struct FileBlockReader<'a> {
 
 /// Unique ID of this file, used as key in the page cache.
 file_id: page_cache::FileId,
+
+compressed_reads: bool,
 }
 
 impl<'a> FileBlockReader<'a> {
 pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
-FileBlockReader { file_id, file }
+FileBlockReader {
+file_id,
+file,
+compressed_reads: true,
+}
 }
 
 /// Read a page from the underlying file into given buffer.
@@ -217,7 +253,10 @@ impl<'a> FileBlockReader<'a> {
 
 impl BlockReader for FileBlockReader<'_> {
 fn block_cursor(&self) -> BlockCursor<'_> {
-BlockCursor::new(BlockReaderRef::FileBlockReader(self))
+BlockCursor::new_with_compression(
+BlockReaderRef::FileBlockReader(self),
+self.compressed_reads,
+)
 }
 }
 
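The block_io changes above thread a `read_compressed` flag through `BlockCursor` without breaking existing callers: `new` keeps its old signature and delegates to the wider `new_with_compression` constructor with `false`. A minimal standalone sketch of that delegation pattern, using hypothetical stand-in types rather than the pageserver's own:

```rust
// Standalone sketch of the constructor-delegation pattern used above.
// `Reader` and `Cursor` are hypothetical stand-ins, not the pageserver types.
struct Reader;

struct Cursor {
    read_compressed: bool,
    reader: Reader,
}

impl Cursor {
    /// Old entry point: callers that don't care about compression keep working.
    fn new(reader: Reader) -> Self {
        Self::new_with_compression(reader, false)
    }

    /// New entry point: callers that hold compressed data opt in explicitly.
    fn new_with_compression(reader: Reader, read_compressed: bool) -> Self {
        Cursor {
            read_compressed,
            reader,
        }
    }
}

fn main() {
    let legacy = Cursor::new(Reader);
    let compressed = Cursor::new_with_compression(Reader, true);
    assert!(!legacy.read_compressed);
    assert!(compressed.read_compressed);
}
```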
@@ -335,7 +335,6 @@ pub struct TenantConf
 /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
 /// to avoid eager reconnects.
 pub max_lsn_wal_lag: NonZeroU64,
-pub trace_read_requests: bool,
 pub eviction_policy: EvictionPolicy,
 pub min_resident_size_override: Option<u64>,
 // See the corresponding metric's help string.
@@ -436,10 +435,6 @@ pub struct TenantConfOpt {
 #[serde(default)]
 pub max_lsn_wal_lag: Option<NonZeroU64>,
 
-#[serde(skip_serializing_if = "Option::is_none")]
-#[serde(default)]
-pub trace_read_requests: Option<bool>,
-
 #[serde(skip_serializing_if = "Option::is_none")]
 #[serde(default)]
 pub eviction_policy: Option<EvictionPolicy>,
@@ -519,9 +514,6 @@ impl TenantConfOpt {
 .lagging_wal_timeout
 .unwrap_or(global_conf.lagging_wal_timeout),
 max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
-trace_read_requests: self
-.trace_read_requests
-.unwrap_or(global_conf.trace_read_requests),
 eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
 min_resident_size_override: self
 .min_resident_size_override
@@ -581,7 +573,6 @@ impl Default for TenantConf {
 .expect("cannot parse default walreceiver lagging wal timeout"),
 max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
 .expect("cannot parse default max walreceiver Lsn wal lag"),
-trace_read_requests: false,
 eviction_policy: EvictionPolicy::NoEviction,
 min_resident_size_override: None,
 evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
@@ -659,7 +650,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
 walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
 lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
 max_lsn_wal_lag: value.max_lsn_wal_lag,
-trace_read_requests: value.trace_read_requests,
 eviction_policy: value.eviction_policy,
 min_resident_size_override: value.min_resident_size_override,
 evictions_low_residence_duration_metric_threshold: value
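For orientation, the `TenantConfOpt` fields removed above all follow the same merge pattern: each per-tenant `Option` falls back to the global default via `unwrap_or`, which is why dropping `trace_read_requests` has to touch the struct, the merge code, the `Default` impl, and the models conversion together. A minimal sketch of that merge, with hypothetical field names:

```rust
// Minimal sketch of the Option-over-defaults merge used by TenantConfOpt.
// Field names here are illustrative; the real structs carry many more fields.
#[derive(Clone, Copy)]
struct GlobalConf {
    compaction_threshold: usize,
    gc_horizon: u64,
}

#[derive(Default)]
struct ConfOverrides {
    compaction_threshold: Option<usize>,
    gc_horizon: Option<u64>,
}

impl ConfOverrides {
    fn merge(&self, global: GlobalConf) -> GlobalConf {
        GlobalConf {
            compaction_threshold: self
                .compaction_threshold
                .unwrap_or(global.compaction_threshold),
            gc_horizon: self.gc_horizon.unwrap_or(global.gc_horizon),
        }
    }
}

fn main() {
    let global = GlobalConf {
        compaction_threshold: 10,
        gc_horizon: 64 * 1024 * 1024,
    };
    let overrides = ConfOverrides {
        gc_horizon: Some(1024),
        ..Default::default()
    };
    let effective = overrides.merge(global);
    assert_eq!(effective.compaction_threshold, 10); // falls back to global
    assert_eq!(effective.gc_horizon, 1024); // per-tenant override wins
}
```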
@@ -550,10 +550,10 @@ where
 /// We maintain the length of the stack to be always greater than zero.
 /// Two exceptions are:
 /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
 /// So because other methods cannot see the intermediate state invariant still holds.
 /// 2. `Self::finish`. It consumes self and does not return it back,
 /// which means that this is where the structure is destroyed.
 /// Thus stack of zero length cannot be observed by other methods.
 stack: Vec<BuildNode<L>>,
 
 /// Last key that was appended to the tree. Used to sanity check that append
@@ -21,6 +21,7 @@ pub struct EphemeralFile {
 }
 
 mod page_caching;
+pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;
 
 impl EphemeralFile {
@@ -53,7 +54,7 @@ impl EphemeralFile {
 Ok(EphemeralFile {
 _tenant_shard_id: tenant_shard_id,
 _timeline_id: timeline_id,
-rw: page_caching::RW::new(file),
+rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
 })
 }
 
@@ -65,6 +66,11 @@ impl EphemeralFile {
 self.rw.page_cache_file_id()
 }
 
+/// See [`self::page_caching::RW::load_to_vec`].
+pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
+self.rw.load_to_vec(ctx).await
+}
+
 pub(crate) async fn read_blk(
 &self,
 blknum: u32,
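The `EphemeralFile` constructor above now forwards a pre-warming knob (`conf.l0_flush.prewarm_on_write()`) into the page-caching writer defined in the next file. A standalone sketch of gating a side effect on such a two-variant enum, assuming a similarly shaped type:

```rust
// Standalone sketch: gate an optional side effect on a two-variant enum,
// mirroring how PrewarmOnWrite is matched in the write path further down.
#[derive(Clone, Copy, Debug)]
enum PrewarmOnWrite {
    Yes,
    No,
}

struct Writer {
    prewarm_on_write: PrewarmOnWrite,
    prewarmed_blocks: usize,
}

impl Writer {
    fn new(prewarm_on_write: PrewarmOnWrite) -> Self {
        Writer {
            prewarm_on_write,
            prewarmed_blocks: 0,
        }
    }

    fn write_blocks(&mut self, nblocks: usize) {
        // ... write to the backing file here ...
        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
            // Only populate the (imaginary) page cache when the knob says so.
            self.prewarmed_blocks += nblocks;
        }
    }
}

fn main() {
    let mut on = Writer::new(PrewarmOnWrite::Yes);
    let mut off = Writer::new(PrewarmOnWrite::No);
    on.write_blocks(4);
    off.write_blocks(4);
    assert_eq!(on.prewarmed_blocks, 4);
    assert_eq!(off.prewarmed_blocks, 0);
}
```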
@@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile;
 
 use once_cell::sync::Lazy;
 use std::io::{self, ErrorKind};
+use std::ops::{Deref, Range};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;
 
@@ -19,14 +20,23 @@ pub struct RW {
 rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
 }
 
+/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
+/// should we pre-warm the [`crate::page_cache`] with the contents?
+#[derive(Clone, Copy)]
+pub enum PrewarmOnWrite {
+Yes,
+No,
+}
+
 impl RW {
-pub fn new(file: VirtualFile) -> Self {
+pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
 let page_cache_file_id = page_cache::next_file_id();
 Self {
 page_cache_file_id,
 rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
 page_cache_file_id,
 file,
+prewarm_on_write,
 )),
 }
 }
@@ -49,6 +59,43 @@ impl RW {
 self.rw.bytes_written()
 }
 
+/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
+///
+/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
+/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
+pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
+// round up to the next PAGE_SZ multiple, required by blob_io
+let size = {
+let s = usize::try_from(self.bytes_written()).unwrap();
+if s % PAGE_SZ == 0 {
+s
+} else {
+s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
+}
+};
+let vec = Vec::with_capacity(size);
+
+// read from disk what we've already flushed
+let writer = self.rw.as_writer();
+let flushed_range = writer.written_range();
+let mut vec = writer
+.file
+.read_exact_at(
+vec.slice(0..(flushed_range.end - flushed_range.start)),
+u64::try_from(flushed_range.start).unwrap(),
+ctx,
+)
+.await?
+.into_inner();
+
+// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
+let buffered = self.rw.get_tail_zero_padded();
+vec.extend_from_slice(buffered);
+assert_eq!(vec.len(), size);
+assert_eq!(vec.len() % PAGE_SZ, 0);
+Ok(vec)
+}
+
 pub(crate) async fn read_blk(
 &self,
 blknum: u32,
@@ -116,19 +163,40 @@ impl Drop for RW {
 }
 
 struct PreWarmingWriter {
+prewarm_on_write: PrewarmOnWrite,
 nwritten_blocks: u32,
 page_cache_file_id: page_cache::FileId,
 file: VirtualFile,
 }
 
 impl PreWarmingWriter {
-fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
+fn new(
+page_cache_file_id: page_cache::FileId,
+file: VirtualFile,
+prewarm_on_write: PrewarmOnWrite,
+) -> Self {
 Self {
+prewarm_on_write,
 nwritten_blocks: 0,
 page_cache_file_id,
 file,
 }
 }
 
+/// Return the byte range within `file` that has been written though `write_all`.
+///
+/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
+fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
+let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
+struct Wrapper(Range<usize>);
+impl Deref for Wrapper {
+type Target = Range<usize>;
+fn deref(&self) -> &Range<usize> {
+&self.0
+}
+}
+Wrapper(0..nwritten_blocks * PAGE_SZ)
+}
 }
 
 impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
@@ -178,45 +246,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
 assert_eq!(&check_bounds_stuff_works, &*buf);
 }
 
-// Pre-warm page cache with the contents.
-// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-// benefits the code that writes InMemoryLayer=>L0 layers.
 let nblocks = buflen / PAGE_SZ;
 let nblocks32 = u32::try_from(nblocks).unwrap();
-let cache = page_cache::get();
-static CTX: Lazy<RequestContext> = Lazy::new(|| {
-RequestContext::new(
-crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-crate::context::DownloadBehavior::Error,
-)
-});
-for blknum_in_buffer in 0..nblocks {
-let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-let blknum = self
-.nwritten_blocks
-.checked_add(blknum_in_buffer as u32)
-.unwrap();
-match cache
-.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-.await
-{
-Err(e) => {
-error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-}
-Ok(v) => match v {
-page_cache::ReadBufResult::Found(_guard) => {
-// This function takes &mut self, so, it shouldn't be possible to reach this point.
-unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
+if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
+// Pre-warm page cache with the contents.
+// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
+// benefits the code that writes InMemoryLayer=>L0 layers.
+let cache = page_cache::get();
+static CTX: Lazy<RequestContext> = Lazy::new(|| {
+RequestContext::new(
+crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
+crate::context::DownloadBehavior::Error,
+)
+});
+for blknum_in_buffer in 0..nblocks {
+let blk_in_buffer =
+&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
+let blknum = self
+.nwritten_blocks
+.checked_add(blknum_in_buffer as u32)
+.unwrap();
+match cache
+.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
+.await
+{
+Err(e) => {
+error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+}
+Ok(v) => match v {
+page_cache::ReadBufResult::Found(_guard) => {
+// This function takes &mut self, so, it shouldn't be possible to reach this point.
+unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
 and this function takes &mut self, so, no concurrent read_blk is possible");
 }
 page_cache::ReadBufResult::NotFound(mut write_guard) => {
 write_guard.copy_from_slice(blk_in_buffer);
 let _ = write_guard.mark_valid();
 }
 },
+}
 }
 }
 
 self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
 Ok((buflen, buf.into_inner()))
 }
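`written_range` above returns an `impl Deref<Target = Range<usize>>` that captures `&self`, so the borrow checker keeps the returned range from being used across a later mutating write. A reduced sketch of the same trick with plain stand-in types:

```rust
use std::ops::{Deref, Range};

// Reduced sketch of the `written_range` trick above: return a Deref wrapper
// that captures `&self`, so the range cannot outlive the borrow of the writer.
struct Writer {
    nwritten_blocks: usize,
    block_size: usize,
}

impl Writer {
    fn written_range(&self) -> impl Deref<Target = Range<usize>> + '_ {
        struct Wrapper(Range<usize>);
        impl Deref for Wrapper {
            type Target = Range<usize>;
            fn deref(&self) -> &Range<usize> {
                &self.0
            }
        }
        Wrapper(0..self.nwritten_blocks * self.block_size)
    }

    fn write_block(&mut self) {
        self.nwritten_blocks += 1;
    }
}

fn main() {
    let mut w = Writer {
        nwritten_blocks: 3,
        block_size: 8192,
    };
    {
        let range = w.written_range();
        assert_eq!(range.end - range.start, 3 * 8192);
        // Calling `w.write_block()` here would not compile: `range` still borrows `w`.
    }
    w.write_block(); // fine once the wrapper is dropped
}
```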
@@ -75,6 +75,21 @@ where
 flushed_offset + u64::try_from(buffer.pending()).unwrap()
 }
 
+/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
+pub fn get_tail_zero_padded(&self) -> &[u8] {
+let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
+let buffer_written_up_to = buffer.pending();
+// pad to next page boundary
+let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
+buffer_written_up_to
+} else {
+buffer_written_up_to
+.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
+.unwrap()
+};
+&buffer.as_zero_padded_slice()[0..read_up_to]
+}
+
 pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
 let flushed_offset = self.buffered_writer.as_inner().bytes_written();
 let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
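Both `load_to_vec` above and `get_tail_zero_padded` here round a byte count up to the next `PAGE_SZ` boundary with checked arithmetic before zero-padding. A standalone sketch of that rounding; the `PAGE_SZ` value is an assumption here, the real constant lives in the page cache module:

```rust
// Standalone sketch of the round-up-to-page-boundary arithmetic used by
// load_to_vec / get_tail_zero_padded. PAGE_SZ = 8192 is an assumption here.
const PAGE_SZ: usize = 8192;

fn round_up_to_page(n: usize) -> usize {
    if n % PAGE_SZ == 0 {
        n
    } else {
        // checked_add mirrors the original's refusal to silently wrap on overflow
        n.checked_add(PAGE_SZ - (n % PAGE_SZ)).unwrap()
    }
}

fn main() {
    assert_eq!(round_up_to_page(0), 0);
    assert_eq!(round_up_to_page(1), PAGE_SZ);
    assert_eq!(round_up_to_page(PAGE_SZ), PAGE_SZ);
    assert_eq!(round_up_to_page(PAGE_SZ + 1), 2 * PAGE_SZ);
}
```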
@@ -43,7 +43,8 @@ use crate::tenant::config::{
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
+use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
+use crate::virtual_file::MaybeFatalIo;
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
@@ -272,7 +273,7 @@ pub struct TenantManager {
 }
 
 fn emergency_generations(
-tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
+tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
 ) -> HashMap<TenantShardId, TenantStartupMode> {
 tenant_confs
 .iter()
@@ -296,7 +297,7 @@ fn emergency_generations(
 
 async fn init_load_generations(
 conf: &'static PageServerConf,
-tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
+tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
 resources: &TenantSharedResources,
 cancel: &CancellationToken,
 ) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
@@ -346,56 +347,32 @@ async fn init_load_generations(
 /// Given a directory discovered in the pageserver's tenants/ directory, attempt
 /// to load a tenant config from it.
 ///
-/// If file is missing, return Ok(None)
+/// If we cleaned up something expected (like an empty dir or a temp dir), return None.
 fn load_tenant_config(
 conf: &'static PageServerConf,
+tenant_shard_id: TenantShardId,
 dentry: Utf8DirEntry,
-) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
+) -> Option<Result<LocationConf, LoadConfigError>> {
 let tenant_dir_path = dentry.path().to_path_buf();
 if crate::is_temporary(&tenant_dir_path) {
 info!("Found temporary tenant directory, removing: {tenant_dir_path}");
 // No need to use safe_remove_tenant_dir_all because this is already
 // a temporary path
-if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
-error!(
-"Failed to remove temporary directory '{}': {:?}",
-tenant_dir_path, e
-);
-}
-return Ok(None);
+std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir");
+return None;
 }
 
 // This case happens if we crash during attachment before writing a config into the dir
 let is_empty = tenant_dir_path
 .is_empty_dir()
-.with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?;
+.fatal_err("Checking for empty tenant dir");
 if is_empty {
 info!("removing empty tenant directory {tenant_dir_path:?}");
-if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
-error!(
-"Failed to remove empty tenant directory '{}': {e:#}",
-tenant_dir_path
-)
-}
-return Ok(None);
+std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir");
+return None;
 }
 
-let tenant_shard_id = match tenant_dir_path
-.file_name()
-.unwrap_or_default()
-.parse::<TenantShardId>()
-{
-Ok(id) => id,
-Err(_) => {
-warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",);
-return Ok(None);
-}
-};
-
-Ok(Some((
-tenant_shard_id,
-Tenant::load_tenant_config(conf, &tenant_shard_id),
-)))
+Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
 }
 
 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -405,32 +382,51 @@ fn load_tenant_config(
 /// seconds even on reasonably fast drives.
 async fn init_load_tenant_configs(
 conf: &'static PageServerConf,
-) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
+) -> HashMap<TenantShardId, Result<LocationConf, LoadConfigError>> {
 let tenants_dir = conf.tenants_path();
 
-let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
-let dir_entries = tenants_dir
-.read_dir_utf8()
-.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+let dentries = tokio::task::spawn_blocking(move || -> Vec<Utf8DirEntry> {
+let context = format!("read tenants dir {tenants_dir}");
+let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context);
 
-Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
+dir_entries
+.collect::<Result<Vec<_>, std::io::Error>>()
+.fatal_err(&context)
 })
-.await??;
+.await
+.expect("Config load task panicked");
 
 let mut configs = HashMap::new();
 
 let mut join_set = JoinSet::new();
 for dentry in dentries {
-join_set.spawn_blocking(move || load_tenant_config(conf, dentry));
+let tenant_shard_id = match dentry.file_name().parse::<TenantShardId>() {
+Ok(id) => id,
+Err(_) => {
+warn!(
+"Invalid tenant path (garbage in our repo directory?): '{}'",
+dentry.file_name()
+);
+continue;
+}
+};
+
+join_set.spawn_blocking(move || {
+(
+tenant_shard_id,
+load_tenant_config(conf, tenant_shard_id, dentry),
+)
+});
 }
 
 while let Some(r) = join_set.join_next().await {
-if let Some((tenant_id, tenant_config)) = r?? {
-configs.insert(tenant_id, tenant_config);
+let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task");
+if let Some(tenant_config) = tenant_config {
+configs.insert(tenant_shard_id, tenant_config);
 }
 }
 
-Ok(configs)
+configs
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -472,7 +468,7 @@ pub async fn init_tenant_mgr(
 );
 
 // Scan local filesystem for attached tenants
-let tenant_configs = init_load_tenant_configs(conf).await?;
+let tenant_configs = init_load_tenant_configs(conf).await;
 
 // Determine which tenants are to be secondary or attached, and in which generation
 let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
@@ -590,31 +586,23 @@ pub async fn init_tenant_mgr(
 );
 // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
 for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
-// Errors writing configs are fatal
-config_write_result?;
+// Writing a config to local disk is foundational to startup up tenants: panic if we can't.
+config_write_result.fatal_err("write tenant shard config file");
 
 let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
 let shard_identity = location_conf.shard;
 let slot = match location_conf.mode {
-LocationMode::Attached(attached_conf) => {
-match tenant_spawn(
-conf,
-tenant_shard_id,
-&tenant_dir_path,
-resources.clone(),
-AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-shard_identity,
-Some(init_order.clone()),
-SpawnMode::Lazy,
-&ctx,
-) {
-Ok(tenant) => TenantSlot::Attached(tenant),
-Err(e) => {
-error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
-continue;
-}
-}
-}
+LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
+conf,
+tenant_shard_id,
+&tenant_dir_path,
+resources.clone(),
+AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+shard_identity,
+Some(init_order.clone()),
+SpawnMode::Lazy,
+&ctx,
+)),
 LocationMode::Secondary(secondary_conf) => {
 info!(
 tenant_id = %tenant_shard_id.tenant_id,
@@ -649,8 +637,7 @@ pub async fn init_tenant_mgr(
 })
 }
 
-/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
-/// a broken tenant in the map if Tenant::spawn fails.
+/// Wrapper for Tenant::spawn that checks invariants before running
 #[allow(clippy::too_many_arguments)]
 fn tenant_spawn(
 conf: &'static PageServerConf,
@@ -662,23 +649,18 @@ fn tenant_spawn(
 init_order: Option<InitializationOrder>,
 mode: SpawnMode,
 ctx: &RequestContext,
-) -> anyhow::Result<Arc<Tenant>> {
-anyhow::ensure!(
-tenant_path.is_dir(),
-"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
-);
-anyhow::ensure!(
-!crate::is_temporary(tenant_path),
-"Cannot load tenant from temporary path {tenant_path:?}"
-);
-anyhow::ensure!(
-!tenant_path.is_empty_dir().with_context(|| {
-format!("Failed to check whether {tenant_path:?} is an empty dir")
-})?,
-"Cannot load tenant from empty directory {tenant_path:?}"
-);
+) -> Arc<Tenant> {
+// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
+// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
+// to avoid impacting prod runtime performance.
+assert!(!crate::is_temporary(tenant_path));
+debug_assert!(tenant_path.is_dir());
+debug_assert!(conf
+.tenant_location_config_path(&tenant_shard_id)
+.try_exists()
+.unwrap());
 
-let tenant = Tenant::spawn(
+Tenant::spawn(
 conf,
 tenant_shard_id,
 resources,
@@ -687,9 +669,7 @@ fn tenant_spawn(
 init_order,
 mode,
 ctx,
-);
-
-Ok(tenant)
+)
 }
 
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
@@ -840,8 +820,9 @@ pub(crate) enum UpsertLocationError {
 #[error("Failed to flush: {0}")]
 Flush(anyhow::Error),
 
+/// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state.
 #[error("Internal error: {0}")]
-Other(#[from] anyhow::Error),
+InternalError(anyhow::Error),
 }
 
 impl TenantManager {
@@ -971,7 +952,8 @@ impl TenantManager {
 match fast_path_taken {
 Some(FastPathModified::Attached(tenant)) => {
 Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-.await?;
+.await
+.fatal_err("write tenant shard config");
 
 // Transition to AttachedStale means we may well hold a valid generation
 // still, and have been requested to go stale as part of a migration. If
@@ -1001,7 +983,8 @@ impl TenantManager {
 }
 Some(FastPathModified::Secondary(_secondary_tenant)) => {
 Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-.await?;
+.await
+.fatal_err("write tenant shard config");
 
 return Ok(None);
 }
@@ -1067,7 +1050,7 @@ impl TenantManager {
 Some(TenantSlot::InProgress(_)) => {
 // This should never happen: acquire_slot should error out
 // if the contents of a slot were InProgress.
-return Err(UpsertLocationError::Other(anyhow::anyhow!(
+return Err(UpsertLocationError::InternalError(anyhow::anyhow!(
 "Acquired an InProgress slot, this is a bug."
 )));
 }
@@ -1086,12 +1069,14 @@ impl TenantManager {
 // Does not need to be fsync'd because local storage is just a cache.
 tokio::fs::create_dir_all(&timelines_path)
 .await
-.with_context(|| format!("Creating {timelines_path}"))?;
+.fatal_err("create timelines/ dir");
 
 // Before activating either secondary or attached mode, persist the
 // configuration, so that on restart we will re-attach (or re-start
 // secondary) on the tenant.
-Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;
+Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+.await
+.fatal_err("write tenant shard config");
 
 let new_slot = match &new_location_config.mode {
 LocationMode::Secondary(secondary_config) => {
@@ -1110,13 +1095,15 @@ impl TenantManager {
 // from upserts. This enables creating generation-less tenants even though neon_local
 // always uses generations when calling the location conf API.
 let attached_conf = if cfg!(feature = "testing") {
-let mut conf = AttachedTenantConf::try_from(new_location_config)?;
+let mut conf = AttachedTenantConf::try_from(new_location_config)
+.map_err(UpsertLocationError::BadRequest)?;
 if self.conf.control_plane_api.is_none() {
 conf.location.generation = Generation::none();
 }
 conf
 } else {
-AttachedTenantConf::try_from(new_location_config)?
+AttachedTenantConf::try_from(new_location_config)
+.map_err(UpsertLocationError::BadRequest)?
 };
 
 let tenant = tenant_spawn(
@@ -1129,7 +1116,7 @@ impl TenantManager {
 None,
 spawn_mode,
 ctx,
-)?;
+);
 
 TenantSlot::Attached(tenant)
 }
@@ -1143,7 +1130,7 @@ impl TenantManager {
 
 match slot_guard.upsert(new_slot) {
 Err(TenantSlotUpsertError::InternalError(e)) => {
-Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
+Err(UpsertLocationError::InternalError(anyhow::anyhow!(e)))
 }
 Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
 Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
@@ -1250,7 +1237,7 @@ impl TenantManager {
 None,
 SpawnMode::Eager,
 ctx,
-)?;
+);
 
 slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
@@ -1984,7 +1971,7 @@ impl TenantManager {
 None,
 SpawnMode::Eager,
 ctx,
-)?;
+);
 
 slot_guard.upsert(TenantSlot::Attached(tenant))?;
 
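Several call sites above swap `anyhow` propagation for `.fatal_err(...)`, treating local-disk I/O failures during startup as unrecoverable. The real helper is the pageserver's `MaybeFatalIo` trait; the sketch below only illustrates the calling convention with a hypothetical trait of the same shape:

```rust
use std::fmt::Display;

// Hypothetical stand-in for the pageserver's MaybeFatalIo::fatal_err helper:
// unwrap the Ok value, or log the context and abort the process on Err.
trait FatalErr<T> {
    fn fatal_err(self, context: &str) -> T;
}

impl<T, E: Display> FatalErr<T> for Result<T, E> {
    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => {
                eprintln!("fatal I/O error: {context}: {e}");
                std::process::exit(1);
            }
        }
    }
}

fn main() {
    // Succeeds: current_dir should exist, so this just unwraps.
    let cwd = std::env::current_dir().fatal_err("read current directory");
    println!("running in {}", cwd.display());

    // A failing call would terminate the process instead of propagating an error:
    // std::fs::read("/definitely/missing/file").fatal_err("read tenant config");
}
```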
@@ -519,7 +519,7 @@ impl RemoteTimelineClient {
 local_path: &Utf8Path,
 cancel: &CancellationToken,
 ctx: &RequestContext,
-) -> anyhow::Result<u64> {
+) -> Result<u64, DownloadError> {
 let downloaded_size = {
 let _unfinished_gauge_guard = self.metrics.call_begin(
 &RemoteOpFileKind::Layer,
@@ -23,6 +23,8 @@ use super::{
 storage_layer::LayerName,
 };
 
+use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
+use metrics::UIntGauge;
 use pageserver_api::{
 models,
 shard::{ShardIdentity, TenantShardId},
@@ -99,6 +101,17 @@ pub(crate) struct SecondaryTenant {
 
 // Public state indicating overall progress of downloads relative to the last heatmap seen
 pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
+
+// Sum of layer sizes on local disk
+pub(super) resident_size_metric: UIntGauge,
+}
+
+impl Drop for SecondaryTenant {
+fn drop(&mut self) {
+let tenant_id = self.tenant_shard_id.tenant_id.to_string();
+let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
+let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+}
 }
 
 impl SecondaryTenant {
@@ -108,6 +121,12 @@ impl SecondaryTenant {
 tenant_conf: TenantConfOpt,
 config: &SecondaryLocationConfig,
 ) -> Arc<Self> {
+let tenant_id = tenant_shard_id.tenant_id.to_string();
+let shard_id = format!("{}", tenant_shard_id.shard_slug());
+let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
+.get_metric_with_label_values(&[&tenant_id, &shard_id])
+.unwrap();
+
 Arc::new(Self {
 tenant_shard_id,
 // todo: shall we make this a descendent of the
@@ -123,6 +142,8 @@ impl SecondaryTenant {
 detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
 
 progress: std::sync::Mutex::default(),
+
+resident_size_metric,
 })
 }
 
@@ -211,16 +232,12 @@ impl SecondaryTenant {
 // have to 100% match what is on disk, because it's a best-effort warming
 // of the cache.
 let mut detail = this.detail.lock().unwrap();
-if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-let removed = timeline_detail.on_disk_layers.remove(&name);
-// We might race with removal of the same layer during downloads, if it was removed
-// from the heatmap. If we see that the OnDiskState is gone, then no need to
-// do a physical deletion or store in evicted_at.
-if let Some(removed) = removed {
-removed.remove_blocking();
-timeline_detail.evicted_at.insert(name, now);
-}
+if let Some(removed) =
+detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
+{
+// We might race with removal of the same layer during downloads, so finding the layer we
+// were trying to remove is optional. Only issue the disk I/O to remove it if we found it.
+removed.remove_blocking();
 }
 })
 .await
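The `SecondaryTenant` changes above acquire a per-shard labelled gauge at construction time and drop that label set again in `Drop`, so shards that no longer exist disappear from the exporter. A sketch of that lifecycle using the upstream `prometheus` crate directly (the pageserver goes through its own `metrics` wrapper and a u64 gauge, but the shape is the same; the metric name below is made up):

```rust
// Sketch of the labelled-gauge lifecycle above. Assumed Cargo dependency:
// prometheus = "0.13". Not the pageserver's metrics wrapper.
use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec};

struct SecondaryShard {
    tenant_id: String,
    shard_id: String,
    resident_size: IntGauge,
}

impl SecondaryShard {
    fn new(vec: &IntGaugeVec, tenant_id: &str, shard_id: &str) -> Self {
        SecondaryShard {
            tenant_id: tenant_id.to_string(),
            shard_id: shard_id.to_string(),
            // Acquire the labelled child gauge once, at construction time.
            resident_size: vec.with_label_values(&[tenant_id, shard_id]),
        }
    }

    // Counterpart of the Drop impl above: forget the labelled child so the
    // exporter stops reporting a shard that no longer exists. (The real code
    // can do this in Drop because the gauge vec is a global static.)
    fn unregister(&self, vec: &IntGaugeVec) {
        let _ = vec.remove_label_values(&[&self.tenant_id, &self.shard_id]);
    }
}

fn main() {
    let vec = register_int_gauge_vec!(
        "secondary_resident_physical_size_sketch",
        "Bytes of secondary-tenant layers on local disk (sketch)",
        &["tenant_id", "shard_id"]
    )
    .unwrap();

    let shard = SecondaryShard::new(&vec, "tenant-a", "0001");
    shard.resident_size.add(8192);
    shard.resident_size.sub(4096);
    assert_eq!(shard.resident_size.get(), 4096);
    shard.unregister(&vec);
}
```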
@@ -46,6 +46,7 @@ use crate::tenant::{
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
+use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
@@ -131,16 +132,66 @@ impl OnDiskState {
 .or_else(fs_ext::ignore_not_found)
 .fatal_err("Deleting secondary layer")
 }
+
+pub(crate) fn file_size(&self) -> u64 {
+self.metadata.file_size
+}
 }
 
 #[derive(Debug, Clone, Default)]
 pub(super) struct SecondaryDetailTimeline {
-pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
+on_disk_layers: HashMap<LayerName, OnDiskState>,
 
 /// We remember when layers were evicted, to prevent re-downloading them.
 pub(super) evicted_at: HashMap<LayerName, SystemTime>,
 }
 
+impl SecondaryDetailTimeline {
+pub(super) fn remove_layer(
+&mut self,
+name: &LayerName,
+resident_metric: &UIntGauge,
+) -> Option<OnDiskState> {
+let removed = self.on_disk_layers.remove(name);
+if let Some(removed) = &removed {
+resident_metric.sub(removed.file_size());
+}
+removed
+}
+
+/// `local_path`
+fn touch_layer<F>(
+&mut self,
+conf: &'static PageServerConf,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+touched: &HeatMapLayer,
+resident_metric: &UIntGauge,
+local_path: F,
+) where
+F: FnOnce() -> Utf8PathBuf,
+{
+use std::collections::hash_map::Entry;
+match self.on_disk_layers.entry(touched.name.clone()) {
+Entry::Occupied(mut v) => {
+v.get_mut().access_time = touched.access_time;
+}
+Entry::Vacant(e) => {
+e.insert(OnDiskState::new(
+conf,
+tenant_shard_id,
+timeline_id,
+touched.name.clone(),
+touched.metadata.clone(),
+touched.access_time,
+local_path(),
+));
+resident_metric.add(touched.metadata.file_size);
+}
+}
+}
+}
+
 // Aspects of a heatmap that we remember after downloading it
 #[derive(Clone, Debug)]
 struct DownloadSummary {
@@ -158,7 +209,7 @@ pub(super) struct SecondaryDetail {
 
 last_download: Option<DownloadSummary>,
 next_download: Option<Instant>,
-pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
+timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
 
 /// Helper for logging SystemTime
@@ -191,6 +242,38 @@ impl SecondaryDetail {
 }
 }
 
+pub(super) fn evict_layer(
+&mut self,
+name: LayerName,
+timeline_id: &TimelineId,
+now: SystemTime,
+resident_metric: &UIntGauge,
+) -> Option<OnDiskState> {
+let timeline = self.timelines.get_mut(timeline_id)?;
+let removed = timeline.remove_layer(&name, resident_metric);
+if removed.is_some() {
+timeline.evicted_at.insert(name, now);
+}
+removed
+}
+
+pub(super) fn remove_timeline(
+&mut self,
+timeline_id: &TimelineId,
+resident_metric: &UIntGauge,
+) {
+let removed = self.timelines.remove(timeline_id);
+if let Some(removed) = removed {
+resident_metric.sub(
+removed
+.on_disk_layers
+.values()
+.map(|l| l.metadata.file_size)
+.sum(),
+);
+}
+}
+
 /// Additionally returns the total number of layers, used for more stable relative access time
 /// based eviction.
 pub(super) fn get_layers_for_eviction(
@@ -601,8 +684,13 @@ impl<'a> TenantDownloader<'a> {
 Some(t) => t,
 None => {
 // We have no existing state: need to scan local disk for layers first.
-let timeline_state =
-init_timeline_state(self.conf, tenant_shard_id, timeline).await;
+let timeline_state = init_timeline_state(
+self.conf,
+tenant_shard_id,
+timeline,
+&self.secondary_state.resident_size_metric,
+)
+.await;
 
 // Re-acquire detail lock now that we're done with async load from local FS
 self.secondary_state
@@ -671,6 +759,25 @@ impl<'a> TenantDownloader<'a> {
 .await?;
 }
 
+// Metrics consistency check in testing builds
+if cfg!(feature = "testing") {
+let detail = self.secondary_state.detail.lock().unwrap();
+let resident_size = detail
+.timelines
+.values()
+.map(|tl| {
+tl.on_disk_layers
+.values()
+.map(|v| v.metadata.file_size)
+.sum::<u64>()
+})
+.sum::<u64>();
+assert_eq!(
+resident_size,
+self.secondary_state.resident_size_metric.get()
+);
+}
+
 // Only update last_etag after a full successful download: this way will not skip
 // the next download, even if the heatmap's actual etag is unchanged.
 self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
@@ -783,7 +890,7 @@ impl<'a> TenantDownloader<'a> {
 for delete_timeline in &delete_timelines {
 // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
 // from disk fails that will be a fatal error.
-detail.timelines.remove(delete_timeline);
+detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric);
 }
 }
 
@@ -801,7 +908,7 @@ impl<'a> TenantDownloader<'a> {
 let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
 continue;
 };
-timeline_state.on_disk_layers.remove(&layer_name);
+timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
 }
 
 for timeline_id in delete_timelines {
@@ -1000,33 +1107,24 @@ impl<'a> TenantDownloader<'a> {
 let timeline_detail = detail.timelines.entry(timeline_id).or_default();
 
 tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
-for t in touched {
-use std::collections::hash_map::Entry;
-match timeline_detail.on_disk_layers.entry(t.name.clone()) {
-Entry::Occupied(mut v) => {
-v.get_mut().access_time = t.access_time;
-}
-Entry::Vacant(e) => {
-let local_path = local_layer_path(
+touched.into_iter().for_each(|t| {
+timeline_detail.touch_layer(
+self.conf,
+tenant_shard_id,
+&timeline_id,
+&t,
+&self.secondary_state.resident_size_metric,
+|| {
+local_layer_path(
 self.conf,
 tenant_shard_id,
 &timeline_id,
 &t.name,
 &t.metadata.generation,
-);
-e.insert(OnDiskState::new(
-self.conf,
-tenant_shard_id,
-&timeline_id,
-t.name,
-t.metadata.clone(),
-t.access_time,
-local_path,
-));
-}
-}
-}
+)
+},
+)
+});
 }
 
 result
@@ -1135,6 +1233,7 @@ async fn init_timeline_state(
 conf: &'static PageServerConf,
 tenant_shard_id: &TenantShardId,
 heatmap: &HeatMapTimeline,
+resident_metric: &UIntGauge,
 ) -> SecondaryDetailTimeline {
 let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
 let mut detail = SecondaryDetailTimeline::default();
@@ -1210,17 +1309,13 @@ async fn init_timeline_state(
 } else {
 // We expect the access time to be initialized immediately afterwards, when
 // the latest heatmap is applied to the state.
-detail.on_disk_layers.insert(
-name.clone(),
-OnDiskState::new(
-conf,
-tenant_shard_id,
-&heatmap.timeline_id,
-name,
-remote_meta.metadata.clone(),
-remote_meta.access_time,
-file_path,
-),
+detail.touch_layer(
+conf,
+tenant_shard_id,
+&heatmap.timeline_id,
+remote_meta,
+resident_metric,
+|| file_path,
 );
 }
 }
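The downloader changes above funnel every layer insert and removal through `touch_layer` / `remove_layer` so the resident-size gauge stays in sync, and testing builds re-derive the sum to assert it. A pure-std sketch of that accounting invariant, with a plain counter standing in for the gauge and hypothetical layer names:

```rust
use std::collections::HashMap;

// Pure-std sketch of the resident-size accounting added above: every insert
// and removal of an on-disk layer also adjusts a running counter, and a
// (test-only in the real code) check recomputes the sum to catch drift.
#[derive(Clone)]
struct OnDiskLayer {
    file_size: u64,
}

#[derive(Default)]
struct TimelineDetail {
    on_disk_layers: HashMap<String, OnDiskLayer>,
}

impl TimelineDetail {
    fn touch_layer(&mut self, name: &str, file_size: u64, resident: &mut u64) {
        // Only newly inserted layers change the counter; re-touching does not.
        self.on_disk_layers
            .entry(name.to_string())
            .or_insert_with(|| {
                *resident += file_size;
                OnDiskLayer { file_size }
            });
    }

    fn remove_layer(&mut self, name: &str, resident: &mut u64) {
        if let Some(removed) = self.on_disk_layers.remove(name) {
            *resident -= removed.file_size;
        }
    }

    fn recomputed_size(&self) -> u64 {
        self.on_disk_layers.values().map(|l| l.file_size).sum()
    }
}

fn main() {
    let mut detail = TimelineDetail::default();
    let mut resident: u64 = 0;

    detail.touch_layer("layer-a", 8192, &mut resident);
    detail.touch_layer("layer-b", 16384, &mut resident);
    detail.touch_layer("layer-b", 16384, &mut resident); // re-touch: no double count
    detail.remove_layer("layer-a", &mut resident);

    // The invariant the testing-build assertion checks: counter == recomputed sum.
    assert_eq!(resident, detail.recomputed_size());
    assert_eq!(resident, 16384);
}
```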
@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
+use tenant_size_model::svg::SvgBranchKind;
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -87,6 +88,9 @@ impl SegmentMeta {
 LsnKind::BranchPoint => true,
 LsnKind::GcCutOff => true,
 LsnKind::BranchEnd => false,
+LsnKind::LeasePoint => true,
+LsnKind::LeaseStart => false,
+LsnKind::LeaseEnd => false,
 }
 }
 }
@@ -103,6 +107,21 @@ pub enum LsnKind {
 GcCutOff,
 /// Last record LSN
 BranchEnd,
+/// A LSN lease is granted here.
+LeasePoint,
+/// A lease starts from here.
+LeaseStart,
+/// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]).
+LeaseEnd,
+}
+
+impl From<LsnKind> for SvgBranchKind {
+fn from(kind: LsnKind) -> Self {
+match kind {
+LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease,
+_ => SvgBranchKind::Timeline,
+}
+}
 }
 
 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
@@ -124,6 +143,9 @@ pub struct TimelineInputs {
 
 /// Cutoff point calculated from the user-supplied 'max_retention_period'
 retention_param_cutoff: Option<Lsn>,
+
+/// Lease points on the timeline
+lease_points: Vec<Lsn>,
 }
 
 /// Gathers the inputs for the tenant sizing model.
@@ -234,6 +256,13 @@ pub(super) async fn gather_inputs(
 None
 };
 
+let lease_points = gc_info
+.leases
+.keys()
+.filter(|&&lsn| lsn > ancestor_lsn)
+.copied()
+.collect::<Vec<_>>();
+
 // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
 // want to query any logical size before initdb_lsn.
 let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -248,6 +277,8 @@ pub(super) async fn gather_inputs(
 .map(|lsn| (lsn, LsnKind::BranchPoint))
 .collect::<Vec<_>>();
 
+lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+
 drop(gc_info);
 
 // Add branch points we collected earlier, just in case there were any that were
@@ -296,6 +327,7 @@ pub(super) async fn gather_inputs(
 if kind == LsnKind::BranchPoint {
 branchpoint_segments.insert((timeline_id, lsn), segments.len());
 }
+
 segments.push(SegmentMeta {
 segment: Segment {
 parent: Some(parent),
@@ -306,7 +338,45 @@ pub(super) async fn gather_inputs(
 timeline_id: timeline.timeline_id,
 kind,
 });
-parent += 1;
+
+parent = segments.len() - 1;
+
+if kind == LsnKind::LeasePoint {
+// Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data
+// (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN
+// value. Without the other two segments, the calculation code would not count the leased LSN as a point
+// to be retained.
+// Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug.
+//
+// Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and
+// branch points can be given a synthetic id so we can unite them.
+let mut lease_parent = parent;
+
+// Start of a lease.
+segments.push(SegmentMeta {
+segment: Segment {
+parent: Some(lease_parent),
+lsn: lsn.0,
+size: None, // Filled in later, if necessary
|
||||||
|
needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
|
||||||
|
},
|
||||||
|
timeline_id: timeline.timeline_id,
|
||||||
|
kind: LsnKind::LeaseStart,
|
||||||
|
});
|
||||||
|
lease_parent += 1;
|
||||||
|
|
||||||
|
// End of the lease.
|
||||||
|
segments.push(SegmentMeta {
|
||||||
|
segment: Segment {
|
||||||
|
parent: Some(lease_parent),
|
||||||
|
lsn: lsn.0,
|
||||||
|
size: None, // Filled in later, if necessary
|
||||||
|
needed: true, // everything at the lease LSN must be readable => is needed
|
||||||
|
},
|
||||||
|
timeline_id: timeline.timeline_id,
|
||||||
|
kind: LsnKind::LeaseEnd,
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Current end of the timeline
|
// Current end of the timeline
|
||||||
@@ -332,6 +402,7 @@ pub(super) async fn gather_inputs(
|
|||||||
pitr_cutoff,
|
pitr_cutoff,
|
||||||
next_gc_cutoff,
|
next_gc_cutoff,
|
||||||
retention_param_cutoff,
|
retention_param_cutoff,
|
||||||
|
lease_points,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -674,7 +745,8 @@ fn verify_size_for_multiple_branches() {
|
|||||||
"horizon_cutoff": "0/2210CD0",
|
"horizon_cutoff": "0/2210CD0",
|
||||||
"pitr_cutoff": "0/2210CD0",
|
"pitr_cutoff": "0/2210CD0",
|
||||||
"next_gc_cutoff": "0/2210CD0",
|
"next_gc_cutoff": "0/2210CD0",
|
||||||
"retention_param_cutoff": null
|
"retention_param_cutoff": null,
|
||||||
|
"lease_points": []
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"timeline_id": "454626700469f0a9914949b9d018e876",
|
"timeline_id": "454626700469f0a9914949b9d018e876",
|
||||||
@@ -684,7 +756,8 @@ fn verify_size_for_multiple_branches() {
|
|||||||
"horizon_cutoff": "0/1817770",
|
"horizon_cutoff": "0/1817770",
|
||||||
"pitr_cutoff": "0/1817770",
|
"pitr_cutoff": "0/1817770",
|
||||||
"next_gc_cutoff": "0/1817770",
|
"next_gc_cutoff": "0/1817770",
|
||||||
"retention_param_cutoff": null
|
"retention_param_cutoff": null,
|
||||||
|
"lease_points": []
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
|
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
|
||||||
@@ -694,7 +767,8 @@ fn verify_size_for_multiple_branches() {
|
|||||||
"horizon_cutoff": "0/18B3D98",
|
"horizon_cutoff": "0/18B3D98",
|
||||||
"pitr_cutoff": "0/18B3D98",
|
"pitr_cutoff": "0/18B3D98",
|
||||||
"next_gc_cutoff": "0/18B3D98",
|
"next_gc_cutoff": "0/18B3D98",
|
||||||
"retention_param_cutoff": null
|
"retention_param_cutoff": null,
|
||||||
|
"lease_points": []
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -749,7 +823,8 @@ fn verify_size_for_one_branch() {
|
|||||||
"horizon_cutoff": "47/240A5860",
|
"horizon_cutoff": "47/240A5860",
|
||||||
"pitr_cutoff": "47/240A5860",
|
"pitr_cutoff": "47/240A5860",
|
||||||
"next_gc_cutoff": "47/240A5860",
|
"next_gc_cutoff": "47/240A5860",
|
||||||
"retention_param_cutoff": "0/0"
|
"retention_param_cutoff": "0/0",
|
||||||
|
"lease_points": []
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}"#;
|
}"#;
|
||||||
|
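The lease segments added in the hunk above model a lease as a read-only branch of zero length: three segments share one LSN, and only the lease end is unconditionally `needed`. Below is a minimal, self-contained sketch of that shape; `Seg`, `push_lease`, and its field names are illustrative stand-ins for the real `Segment`/`SegmentMeta` types, not the crate's API.

```rust
// Illustrative only: a flattened version of the lease-as-zero-length-branch modeling.
#[derive(Debug)]
struct Seg {
    parent: Option<usize>,
    lsn: u64,
    needed: bool,
    kind: &'static str,
}

/// Push the start/end pair that turns a lease granted at `lsn` into a branch that never
/// advances past its branch point, so the sizing model retains everything readable at `lsn`.
fn push_lease(segments: &mut Vec<Seg>, lease_point: usize, lsn: u64, next_gc_cutoff: u64) {
    segments.push(Seg { parent: Some(lease_point), lsn, needed: lsn > next_gc_cutoff, kind: "LeaseStart" });
    let lease_start = segments.len() - 1;
    segments.push(Seg { parent: Some(lease_start), lsn, needed: true, kind: "LeaseEnd" });
}

fn main() {
    // A branch start, a lease point at 0x40, and a GC cutoff at 0x30.
    let mut segments = vec![Seg { parent: None, lsn: 0x10, needed: false, kind: "BranchStart" }];
    segments.push(Seg { parent: Some(0), lsn: 0x40, needed: true, kind: "LeasePoint" });
    let lease_point = segments.len() - 1;
    push_lease(&mut segments, lease_point, 0x40, 0x30);
    for (i, s) in segments.iter().enumerate() {
        println!("{i}: {s:?}");
    }
}
```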
@@ -7,6 +7,9 @@ pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;

+#[cfg(test)]
+pub mod merge_iterator;
+
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::task_mgr::TaskKind;
@@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::LayerAccessKind;
+use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};

@@ -223,6 +223,11 @@ pub struct DeltaLayerInner {
     file: VirtualFile,
     file_id: FileId,

+    #[allow(dead_code)]
+    layer_key_range: Range<Key>,
+    #[allow(dead_code)]
+    layer_lsn_range: Range<Lsn>,
+
     max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

@@ -452,7 +457,12 @@ impl DeltaLayerWriterInner {
         ctx: &RequestContext,
     ) -> (Vec<u8>, anyhow::Result<()>) {
         assert!(self.lsn_range.start <= lsn);
-        let (val, res) = self.blob_writer.write_blob(val, ctx).await;
+        // We don't want to use compression in delta layer creation
+        let compression = ImageCompressionAlgorithm::Disabled;
+        let (val, res) = self
+            .blob_writer
+            .write_blob_maybe_compressed(val, ctx, compression)
+            .await;
         let off = match res {
             Ok(off) => off,
             Err(e) => return (val, Err(anyhow::anyhow!(e))),

@@ -737,6 +747,16 @@ impl DeltaLayer {
 }

 impl DeltaLayerInner {
+    #[cfg(test)]
+    pub(crate) fn key_range(&self) -> &Range<Key> {
+        &self.layer_key_range
+    }
+
+    #[cfg(test)]
+    pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
+        &self.layer_lsn_range
+    }
+
     /// Returns nested result following Result<Result<_, OpErr>, Critical>:
     /// - inner has the success or transient failure
     /// - outer has the permanent failure

@@ -785,6 +805,8 @@ impl DeltaLayerInner {
             index_start_blk: actual_summary.index_start_blk,
             index_root_blk: actual_summary.index_root_blk,
             max_vectored_read_bytes,
+            layer_key_range: actual_summary.key_range,
+            layer_lsn_range: actual_summary.lsn_range,
         }))
     }

@@ -1299,7 +1321,7 @@ impl DeltaLayerInner {
                 offsets.start.pos(),
                 offsets.end.pos(),
                 meta,
-                Some(max_read_size),
+                max_read_size,
             ))
         }
     } else {

@@ -1593,13 +1615,17 @@ impl<'a> DeltaLayerIterator<'a> {
                 let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
                 let blob_ref = BlobRef(value);
                 let offset = blob_ref.pos();
-                if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
+                if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
                     break batch_plan;
                 }
             } else {
                 self.is_end = true;
                 let data_end_offset = self.delta_layer.index_start_offset();
-                break self.planner.handle_range_end(data_end_offset);
+                if let Some(item) = self.planner.handle_range_end(data_end_offset) {
+                    break item;
+                } else {
+                    return Ok(()); // TODO: test empty iterator
+                }
             }
         };
         let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);

@@ -1634,7 +1660,7 @@ impl<'a> DeltaLayerIterator<'a> {
 }

 #[cfg(test)]
-mod test {
+pub(crate) mod test {
     use std::collections::BTreeMap;

     use itertools::MinMaxResult;

@@ -2212,13 +2238,20 @@ mod test {
         }
     }

-    async fn produce_delta_layer(
+    pub(crate) fn sort_delta(
+        (k1, l1, _): &(Key, Lsn, Value),
+        (k2, l2, _): &(Key, Lsn, Value),
+    ) -> std::cmp::Ordering {
+        (k1, l1).cmp(&(k2, l2))
+    }
+
+    pub(crate) async fn produce_delta_layer(
         tenant: &Tenant,
         tline: &Arc<Timeline>,
         mut deltas: Vec<(Key, Lsn, Value)>,
         ctx: &RequestContext,
     ) -> anyhow::Result<ResidentLayer> {
-        deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
+        deltas.sort_by(sort_delta);
         let (key_start, _, _) = deltas.first().unwrap();
         let (key_max, _, _) = deltas.first().unwrap();
         let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
@@ -369,6 +369,16 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
+    #[cfg(test)]
+    pub(crate) fn key_range(&self) -> &Range<Key> {
+        &self.key_range
+    }
+
+    #[cfg(test)]
+    pub(crate) fn lsn(&self) -> Lsn {
+        self.lsn
+    }
+
     /// Returns nested result following Result<Result<_, OpErr>, Critical>:
     /// - inner has the success or transient failure
     /// - outer has the permanent failure

@@ -799,7 +809,11 @@ impl ImageLayerWriterInner {
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         ensure!(self.key_range.contains(&key));
-        let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
+        let compression = self.conf.image_compression;
+        let (_img, res) = self
+            .blob_writer
+            .write_blob_maybe_compressed(img, ctx, compression)
+            .await;
         // TODO: re-use the buffer for `img` further upstack
         let off = res?;

@@ -984,14 +998,17 @@ impl<'a> ImageLayerIterator<'a> {
                     Key::from_slice(&raw_key[..KEY_SIZE]),
                     self.image_layer.lsn,
                     offset,
-                    BlobFlag::None,
                 ) {
                     break batch_plan;
                 }
             } else {
                 self.is_end = true;
                 let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
-                break self.planner.handle_range_end(payload_end);
+                if let Some(item) = self.planner.handle_range_end(payload_end) {
+                    break item;
+                } else {
+                    return Ok(()); // TODO: a test case on empty iterator
+                }
             }
         };
         let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
@@ -6,13 +6,14 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
+use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
-use crate::tenant::block_io::BlockReader;
+use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
-use crate::{page_cache, walrecord};
+use crate::{l0_flush, page_cache, walrecord};
 use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;

@@ -410,6 +411,7 @@ impl InMemoryLayer {
                 continue;
             }

+            // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
             let buf = reader.read_blob(block_read.block_offset, &ctx).await;
             if let Err(e) = buf {
                 reconstruct_state

@@ -620,6 +622,13 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;

+       let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
+       use l0_flush::Inner;
+       let _concurrency_permit = match &*l0_flush_global_state {
+           Inner::PageCached => None,
+           Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
+       };
+
        let end_lsn = *self.end_lsn.get().unwrap();

        let key_count = if let Some(key_range) = key_range {

@@ -645,28 +654,83 @@ impl InMemoryLayer {
             )
             .await?;

-        let mut buf = Vec::new();
-
-        let cursor = inner.file.block_cursor();
-
-        let ctx = RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::InMemoryLayer)
-            .build();
-        for (key, vec_map) in inner.index.iter() {
-            // Write all page versions
-            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                let will_init = Value::des(&buf)?.will_init();
-                let res;
-                (buf, res) = delta_layer_writer
-                    .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
-                    .await;
-                res?;
+        match &*l0_flush_global_state {
+            l0_flush::Inner::PageCached => {
+                let ctx = RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::InMemoryLayer)
+                    .build();
+
+                let mut buf = Vec::new();
+
+                let cursor = inner.file.block_cursor();
+
+                for (key, vec_map) in inner.index.iter() {
+                    // Write all page versions
+                    for (lsn, pos) in vec_map.as_slice() {
+                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                        let will_init = Value::des(&buf)?.will_init();
+                        let res;
+                        (buf, res) = delta_layer_writer
+                            .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
+                            .await;
+                        res?;
+                    }
+                }
+            }
+            l0_flush::Inner::Direct { .. } => {
+                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
+                assert_eq!(
+                    file_contents.len() % PAGE_SZ,
+                    0,
+                    "needed by BlockReaderRef::Slice"
+                );
+                assert_eq!(file_contents.len(), {
+                    let written = usize::try_from(inner.file.len()).unwrap();
+                    if written % PAGE_SZ == 0 {
+                        written
+                    } else {
+                        written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
+                    }
+                });
+
+                let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
+
+                let mut buf = Vec::new();
+
+                for (key, vec_map) in inner.index.iter() {
+                    // Write all page versions
+                    for (lsn, pos) in vec_map.as_slice() {
+                        // TODO: once we have blob lengths in the in-memory index, we can
+                        // 1. get rid of the blob_io / BlockReaderRef::Slice business and
+                        // 2. load the file contents into a Bytes and
+                        // 3. the use `Bytes::slice` to get the `buf` that is our blob
+                        // 4. pass that `buf` into `put_value_bytes`
+                        // => https://github.com/neondatabase/neon/issues/8183
+                        cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                        let will_init = Value::des(&buf)?.will_init();
+                        let res;
+                        (buf, res) = delta_layer_writer
+                            .put_value_bytes(*key, *lsn, buf, will_init, ctx)
+                            .await;
+                        res?;
+                    }
+                }
             }
         }

         // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
+
+        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
+        //
+        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
+        // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
+        // Thus, we'd have more concurrenct `Vec<u8>` in existence than the semaphore allows.
+        //
+        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
+        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
+        drop(_concurrency_permit);
+
         Ok(Some(delta_layer))
     }
 }
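The `Direct` flush path above takes a semaphore permit before loading the whole in-memory file into a `Vec<u8>` and keeps holding it until the final fsync. Here is a standalone sketch of that pattern under assumed names; `flush_with_permit` and its parameters are illustrative, not part of the pageserver API.

```rust
// Sketch: bound the number of large in-flight buffers with a shared semaphore, and
// release the permit only after the write *and* the fsync have completed.
use std::sync::Arc;
use tokio::io::AsyncWriteExt;
use tokio::sync::Semaphore;

async fn flush_with_permit(
    limiter: Arc<Semaphore>,
    payload: Vec<u8>,
    mut out: tokio::fs::File,
) -> std::io::Result<()> {
    // Acquire before allocating/using the big buffer counts against the limit.
    let permit = limiter.acquire_owned().await.expect("semaphore closed");

    out.write_all(&payload).await?;
    out.sync_all().await?; // fsync while still holding the permit

    // Dropping the permit earlier would allow more large buffers to exist
    // concurrently than the semaphore is meant to permit.
    drop(permit);
    Ok(())
}
```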
@@ -1096,19 +1096,10 @@ impl LayerInner {

         match rx.await {
             Ok(Ok(res)) => Ok(res),
-            Ok(Err(e)) => {
-                // sleep already happened in the spawned task, if it was not cancelled
-                match e.downcast_ref::<remote_storage::DownloadError>() {
-                    // If the download failed due to its cancellation token,
-                    // propagate the cancellation error upstream.
-                    Some(remote_storage::DownloadError::Cancelled) => {
-                        Err(DownloadError::DownloadCancelled)
-                    }
-                    // FIXME: this is not embedding the error because historically it would had
-                    // been output to compute, however that is no longer the case.
-                    _ => Err(DownloadError::DownloadFailed),
-                }
+            Ok(Err(remote_storage::DownloadError::Cancelled)) => {
+                Err(DownloadError::DownloadCancelled)
             }
+            Ok(Err(_)) => Err(DownloadError::DownloadFailed),
             Err(_gone) => Err(DownloadError::DownloadCancelled),
         }
     }

@@ -1118,7 +1109,7 @@ impl LayerInner {
         timeline: Arc<Timeline>,
         permit: heavier_once_cell::InitPermit,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<DownloadedLayer>> {
+    ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
         let result = timeline
             .remote_client
             .download_layer_file(

@@ -25,7 +25,7 @@ pub struct PersistentLayerDesc {
     ///
     /// - For an open in-memory layer, the end bound is MAX_LSN
     /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
     ///   range start
     /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
     pub lsn_range: Range<Lsn>,
     /// Whether this is a delta layer, and also, is this incremental.
pageserver/src/tenant/storage_layer/merge_iterator.rs (new file, 412 lines)
@@ -0,0 +1,412 @@
use std::{
    cmp::Ordering,
    collections::{binary_heap, BinaryHeap},
};

use pageserver_api::key::Key;
use utils::lsn::Lsn;

use crate::{context::RequestContext, repository::Value};

use super::{
    delta_layer::{DeltaLayerInner, DeltaLayerIterator},
    image_layer::{ImageLayerInner, ImageLayerIterator},
};

#[derive(Clone, Copy)]
enum LayerRef<'a> {
    Image(&'a ImageLayerInner),
    Delta(&'a DeltaLayerInner),
}

impl<'a> LayerRef<'a> {
    fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
        match self {
            Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
            Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
        }
    }
}

enum LayerIterRef<'a> {
    Image(ImageLayerIterator<'a>),
    Delta(DeltaLayerIterator<'a>),
}

impl LayerIterRef<'_> {
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        match self {
            Self::Delta(x) => x.next().await,
            Self::Image(x) => x.next().await,
        }
    }
}

/// This type plays several roles at once
/// 1. Unified iterator for image and delta layers.
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
/// 3. Lazy creation of the real delta/image iterator.
enum IteratorWrapper<'a> {
    NotLoaded {
        ctx: &'a RequestContext,
        first_key_lower_bound: (Key, Lsn),
        layer: LayerRef<'a>,
    },
    Loaded {
        iter: PeekableLayerIterRef<'a>,
    },
}

struct PeekableLayerIterRef<'a> {
    iter: LayerIterRef<'a>,
    peeked: Option<(Key, Lsn, Value)>, // None == end
}

impl<'a> PeekableLayerIterRef<'a> {
    async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> {
        let peeked = iter.next().await?;
        Ok(Self { iter, peeked })
    }

    fn peek(&self) -> &Option<(Key, Lsn, Value)> {
        &self.peeked
    }

    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let result = self.peeked.take();
        self.peeked = self.iter.next().await?;
        Ok(result)
    }
}

impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}

impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        let a = self.peek_next_key_lsn();
        let b = other.peek_next_key_lsn();
        match (a, b) {
            (Some((k1, l1)), Some((k2, l2))) => {
                let loaded_1 = if self.is_loaded() { 1 } else { 0 };
                let loaded_2 = if other.is_loaded() { 1 } else { 0 };
                // When key_lsn are the same, the unloaded iter will always appear before the loaded one.
                // And note that we do a reverse at the end of the comparison, so it works with the max heap.
                (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
            }
            (Some(_), None) => Ordering::Less,
            (None, Some(_)) => Ordering::Greater,
            (None, None) => Ordering::Equal,
        }
        .reverse()
    }
}

impl<'a> IteratorWrapper<'a> {
    pub fn create_from_image_layer(
        image_layer: &'a ImageLayerInner,
        ctx: &'a RequestContext,
    ) -> Self {
        Self::NotLoaded {
            layer: LayerRef::Image(image_layer),
            first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
            ctx,
        }
    }

    pub fn create_from_delta_layer(
        delta_layer: &'a DeltaLayerInner,
        ctx: &'a RequestContext,
    ) -> Self {
        Self::NotLoaded {
            layer: LayerRef::Delta(delta_layer),
            first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
            ctx,
        }
    }

    fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
        match self {
            Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
            Self::NotLoaded {
                first_key_lower_bound: (key, lsn),
                ..
            } => Some((key, *lsn)),
        }
    }

    // CORRECTNESS: this function must always take `&mut self`, never `&self`.
    //
    // The reason is that `impl Ord for Self` evaluates differently after this function
    // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when
    // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut`
    // and not just `PeekMut::deref`
    // If we don't take `&mut self`
    async fn load(&mut self) -> anyhow::Result<()> {
        assert!(!self.is_loaded());
        let Self::NotLoaded {
            ctx,
            first_key_lower_bound,
            layer,
        } = self
        else {
            unreachable!()
        };
        let iter = layer.iter(ctx);
        let iter = PeekableLayerIterRef::create(iter).await?;
        if let Some((k1, l1, _)) = iter.peek() {
            let (k2, l2) = first_key_lower_bound;
            debug_assert!((k1, l1) >= (k2, l2));
        }
        *self = Self::Loaded { iter };
        Ok(())
    }

    fn is_loaded(&self) -> bool {
        matches!(self, Self::Loaded { .. })
    }

    /// Correctness: must load the iterator before using.
    ///
    /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it.
    /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
    /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        let Self::Loaded { iter } = self else {
            panic!("must load the iterator before using")
        };
        iter.next().await
    }
}

pub struct MergeIterator<'a> {
    heap: BinaryHeap<IteratorWrapper<'a>>,
}

impl<'a> MergeIterator<'a> {
    pub fn create(
        deltas: &[&'a DeltaLayerInner],
        images: &[&'a ImageLayerInner],
        ctx: &'a RequestContext,
    ) -> Self {
        let mut heap = Vec::with_capacity(images.len() + deltas.len());
        for image in images {
            heap.push(IteratorWrapper::create_from_image_layer(image, ctx));
        }
        for delta in deltas {
            heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx));
        }
        Self {
            heap: BinaryHeap::from(heap),
        }
    }

    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
        while let Some(mut iter) = self.heap.peek_mut() {
            if !iter.is_loaded() {
                // Once we load the iterator, we can know the real first key-value pair in the iterator.
                // We put it back into the heap so that a potentially unloaded layer may have a key between
                // [potential_first_key, loaded_first_key).
                iter.load().await?;
                continue;
            }
            let Some(item) = iter.next().await? else {
                // If the iterator returns None, we pop this iterator. Actually, in the current implementation,
                // we order None > Some, and all the rest of the iterators should return None.
                binary_heap::PeekMut::pop(iter);
                continue;
            };
            return Ok(Some(item));
        }
        Ok(None)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use itertools::Itertools;
    use pageserver_api::key::Key;
    use utils::lsn::Lsn;

    use crate::{
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
        },
        DEFAULT_PG_VERSION,
    };

    async fn assert_merge_iter_equal(
        merge_iter: &mut MergeIterator<'_>,
        expect: &[(Key, Lsn, Value)],
    ) {
        let mut expect_iter = expect.iter();
        loop {
            let o1 = merge_iter.next().await.unwrap();
            let o2 = expect_iter.next();
            assert_eq!(o1.is_some(), o2.is_some());
            if o1.is_none() && o2.is_none() {
                break;
            }
            let (k1, l1, v1) = o1.unwrap();
            let (k2, l2, v2) = o2.unwrap();
            assert_eq!(&k1, k2);
            assert_eq!(l1, *l2);
            assert_eq!(&v1, v2);
        }
    }

    #[tokio::test]
    async fn merge_in_between() {
        use crate::repository::Value;
        use bytes::Bytes;

        let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }
        let test_deltas1 = vec![
            (
                get_key(0),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
            (
                get_key(5),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
        ];
        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
            .await
            .unwrap();
        let test_deltas2 = vec![
            (
                get_key(3),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
            (
                get_key(4),
                Lsn(0x10),
                Value::Image(Bytes::copy_from_slice(b"test")),
            ),
        ];
        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
            .await
            .unwrap();
        let mut merge_iter = MergeIterator::create(
            &[
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        let mut expect = Vec::new();
        expect.extend(test_deltas1);
        expect.extend(test_deltas2);
        expect.sort_by(sort_delta);
        assert_merge_iter_equal(&mut merge_iter, &expect).await;
    }

    #[tokio::test]
    async fn delta_merge() {
        use crate::repository::Value;
        use bytes::Bytes;

        let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await
            .unwrap();

        fn get_key(id: u32) -> Key {
            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
            key.field6 = id;
            key
        }
        const N: usize = 1000;
        let test_deltas1 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10),
                    Lsn(0x20 * ((idx as u64) % 10 + 1)),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
            .await
            .unwrap();
        let test_deltas2 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10),
                    Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
            .await
            .unwrap();
        let test_deltas3 = (0..N)
            .map(|idx| {
                (
                    get_key(idx as u32 / 10 + N as u32),
                    Lsn(0x10 * ((idx as u64) % 10 + 1)),
                    Value::Image(Bytes::from(format!("img{idx:05}"))),
                )
            })
            .collect_vec();
        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
            .await
            .unwrap();
        let mut merge_iter = MergeIterator::create(
            &[
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
            ],
            &[],
            &ctx,
        );
        let mut expect = Vec::new();
        expect.extend(test_deltas1);
        expect.extend(test_deltas2);
        expect.extend(test_deltas3);
        expect.sort_by(sort_delta);
        assert_merge_iter_equal(&mut merge_iter, &expect).await;

        // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
    }

    // TODO: image layer merge, delta+image mixed merge
    // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
}
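The new `MergeIterator` is a k-way merge: every layer contributes a stream already sorted by `(key, lsn)`, and a binary heap always yields the globally smallest head. Below is a dependency-free sketch of the same idea, without the lazy loading and async machinery of the real iterator; all names are illustrative.

```rust
// k-way merge over pre-sorted inputs using a min-heap of "peeked heads".
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn k_merge(inputs: Vec<Vec<(u64, u64, &'static str)>>) -> Vec<(u64, u64, &'static str)> {
    // Heap entries: Reverse((key, lsn, input_idx, pos)) so the smallest (key, lsn) pops first.
    let mut heap = BinaryHeap::new();
    for (idx, input) in inputs.iter().enumerate() {
        if let Some(&(k, l, _)) = input.first() {
            heap.push(Reverse((k, l, idx, 0usize)));
        }
    }

    let mut out = Vec::new();
    while let Some(Reverse((_, _, idx, pos))) = heap.pop() {
        out.push(inputs[idx][pos]);
        // Re-insert the next head of the input we just consumed from, if any.
        if let Some(&(k, l, _)) = inputs[idx].get(pos + 1) {
            heap.push(Reverse((k, l, idx, pos + 1)));
        }
    }
    out
}

fn main() {
    let a = vec![(0, 0x10, "a0"), (5, 0x10, "a5")];
    let b = vec![(3, 0x10, "b3"), (4, 0x10, "b4")];
    let merged = k_merge(vec![a, b]);
    // The output stays sorted by (key, lsn) across all inputs.
    assert!(merged.windows(2).all(|w| (w[0].0, w[0].1) <= (w[1].0, w[1].1)));
    println!("{merged:?}");
}
```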
@@ -14,6 +14,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use arc_swap::ArcSwap;
 use bytes::Bytes;
 use camino::Utf8Path;
+use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
 use once_cell::sync::Lazy;

@@ -65,13 +66,12 @@ use std::{
     ops::{Deref, Range},
 };

-use crate::metrics::GetKind;
-use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
     aux_file::AuxFileSizeEstimator,
     tenant::{
         layer_map::{LayerMap, SearchResult},
         metadata::TimelineMetadata,
+        storage_layer::PersistentLayerDesc,
     },
 };
 use crate::{

@@ -90,10 +90,15 @@ use crate::{
     disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
 };
+use crate::{
+    l0_flush::{self, L0FlushGlobalState},
+    metrics::GetKind,
+};
 use crate::{
     metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
 use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
     virtual_file::{MaybeFatalIo, VirtualFile},

@@ -208,6 +213,7 @@ pub struct TimelineResources {
     pub timeline_get_throttle: Arc<
         crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
     >,
+    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

 pub(crate) struct AuxFilesState {

@@ -360,6 +366,7 @@ pub struct Timeline {
     repartition_threshold: u64,

     last_image_layer_creation_check_at: AtomicLsn,
+    last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,

     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,

@@ -433,6 +440,8 @@ pub struct Timeline {
     /// in the future, add `extra_test_sparse_keyspace` if necessary.
     #[cfg(test)]
     pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
+
+    pub(crate) l0_flush_global_state: L0FlushGlobalState,
 }

 pub struct WalReceiverInfo {

@@ -457,6 +466,9 @@ pub(crate) struct GcInfo {

     /// Leases granted to particular LSNs.
     pub(crate) leases: BTreeMap<Lsn, LsnLease>,
+
+    /// Whether our branch point is within our ancestor's PITR interval (for cost estimation)
+    pub(crate) within_ancestor_pitr: bool,
 }

 impl GcInfo {

@@ -717,6 +729,9 @@ impl From<CreateImageLayersError> for CompactionError {
     fn from(e: CreateImageLayersError) -> Self {
         match e {
             CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Other(e) => {
+                CompactionError::Other(e.context("create image layers"))
+            }
             _ => CompactionError::Other(e.into()),
         }
     }

@@ -845,6 +860,18 @@ impl Timeline {
             .map(|ancestor| ancestor.timeline_id)
     }

+    /// Get the bytes written since the PITR cutoff on this branch, and
+    /// whether this branch's ancestor_lsn is within its parent's PITR.
+    pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
+        let gc_info = self.gc_info.read().unwrap();
+        let history = self
+            .get_last_record_lsn()
+            .checked_sub(gc_info.cutoffs.pitr)
+            .unwrap_or(Lsn(0))
+            .0;
+        (history, gc_info.within_ancestor_pitr)
+    }
+
     /// Lock and get timeline's GC cutoff
     pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
         self.latest_gc_cutoff_lsn.read()

@@ -996,6 +1023,7 @@ impl Timeline {
     }

     pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
+    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;

     /// Look up multiple page versions at a given LSN
     ///

@@ -1228,7 +1256,7 @@ impl Timeline {
         let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
             .for_get_kind(get_kind)
             .start_timer();
-        self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
+        self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
             .await?;
         get_data_timer.stop_and_record();

@@ -1258,11 +1286,25 @@ impl Timeline {
         // (this is a requirement, not a bug). Skip updating the metric in these cases
         // to avoid infinite results.
         if !results.is_empty() {
+            let avg = layers_visited as f64 / results.len() as f64;
+            if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
+                    tracing::info!(
+                      shard_id = %self.tenant_shard_id.shard_slug(),
+                      lsn = %lsn,
+                      "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned",
+                      keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size());
+                });
+            }
+
             // Note that this is an approximation. Tracking the exact number of layers visited
             // per key requires virtually unbounded memory usage and is inefficient
             // (i.e. segment tree tracking each range queried from a layer)
-            crate::metrics::VEC_READ_NUM_LAYERS_VISITED
-                .observe(layers_visited as f64 / results.len() as f64);
+            crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg);
         }

         Ok(results)
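The warning added in the hunk above is rate-limited through a process-wide `Lazy<Mutex<RateLimit>>`. A self-contained approximation of that pattern follows; this `RateLimit` is a stand-in written for the sketch, and the real `utils::rate_limit::RateLimit` API may differ.

```rust
// Emit a log line at most once per interval, sharing the limiter across the process.
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::{Duration, Instant};

struct RateLimit {
    interval: Duration,
    last: Option<Instant>,
}

impl RateLimit {
    fn new(interval: Duration) -> Self {
        Self { interval, last: None }
    }

    /// Run `f` at most once per `interval`; otherwise silently drop the call.
    fn call(&mut self, f: impl FnOnce()) {
        let now = Instant::now();
        if self.last.map_or(true, |t| now.duration_since(t) >= self.interval) {
            self.last = Some(now);
            f();
        }
    }
}

static SLOW_READ_LOGGED: Lazy<Mutex<RateLimit>> =
    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));

fn maybe_warn(avg_layers_per_key: f64, threshold: f64) {
    if avg_layers_per_key >= threshold {
        SLOW_READ_LOGGED.lock().unwrap().call(|| {
            eprintln!("vectored read visited {avg_layers_per_key:.1} layers per key on average");
        });
    }
}
```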
@@ -1554,7 +1596,13 @@ impl Timeline {
                 let existing_lease = occupied.get_mut();
                 if valid_until > existing_lease.valid_until {
                     existing_lease.valid_until = valid_until;
+                    let dt: DateTime<Utc> = valid_until.into();
+                    info!("lease extended to {}", dt);
+                } else {
+                    let dt: DateTime<Utc> = existing_lease.valid_until.into();
+                    info!("existing lease covers greater length, valid until {}", dt);
                 }

                 existing_lease.clone()
             } else {
                 // Reject already GC-ed LSN (lsn < latest_gc_cutoff)

@@ -1563,6 +1611,8 @@ impl Timeline {
                     bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
                 }

+                let dt: DateTime<Utc> = valid_until.into();
+                info!("lease created, valid until {}", dt);
                 entry.or_insert(LsnLease { valid_until }).clone()
             }
         };

@@ -2339,6 +2389,7 @@ impl Timeline {
             )),
             repartition_threshold: 0,
             last_image_layer_creation_check_at: AtomicLsn::new(0),
+            last_image_layer_creation_check_instant: Mutex::new(None),

             last_received_wal: Mutex::new(None),
             rel_size_cache: RwLock::new(RelSizeCache {

@@ -2376,6 +2427,8 @@ impl Timeline {

             #[cfg(test)]
             extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
+
+            l0_flush_global_state: resources.l0_flush_global_state,
         };
         result.repartition_threshold =
             result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

@@ -3355,6 +3408,7 @@ impl Timeline {
         }
     }

+    #[allow(clippy::doc_lazy_continuation)]
     /// Get the data needed to reconstruct all keys in the provided keyspace
     ///
     /// The algorithm is as follows:

@@ -4417,6 +4471,58 @@ impl Timeline {
         }
     }

+    /// Predicate function which indicates whether we should check if new image layers
+    /// are required. Since checking if new image layers are required is expensive in
+    /// terms of CPU, we only do it in the following cases:
+    /// 1. If the timeline has ingested sufficient WAL to justify the cost
+    /// 2. If enough time has passed since the last check:
+    ///     1. For large tenants, we wish to perform the check more often since they
+    ///        suffer from the lack of image layers
+    ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
+    fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
+        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
+
+        let last_checks_at = self.last_image_layer_creation_check_at.load();
+        let distance = lsn
+            .checked_sub(last_checks_at)
+            .expect("Attempt to compact with LSN going backwards");
+        let min_distance =
+            self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance();
+
+        let distance_based_decision = distance.0 >= min_distance;
+
+        let mut time_based_decision = false;
+        let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
+        if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
+            let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
+            {
+                self.get_checkpoint_timeout()
+            } else {
+                Duration::from_secs(3600 * 48)
+            };
+
+            time_based_decision = match *last_check_instant {
+                Some(last_check) => {
+                    let elapsed = last_check.elapsed();
+                    elapsed >= check_required_after
+                }
+                None => true,
+            };
+        }
+
+        // Do the expensive delta layer counting only if this timeline has ingested sufficient
+        // WAL since the last check or a checkpoint timeout interval has elapsed since the last
+        // check.
+        let decision = distance_based_decision || time_based_decision;
+
+        if decision {
+            self.last_image_layer_creation_check_at.store(lsn);
+            *last_check_instant = Some(Instant::now());
+        }
+
+        decision
+    }
+
     #[tracing::instrument(skip_all, fields(%lsn, %mode))]
     async fn create_image_layers(
         self: &Arc<Timeline>,
|
|||||||
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
|
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
|
||||||
let mut start = Key::MIN;
|
let mut start = Key::MIN;
|
||||||
|
|
||||||
let check_for_image_layers = {
|
let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
|
||||||
let last_checks_at = self.last_image_layer_creation_check_at.load();
|
|
||||||
let distance = lsn
|
|
||||||
.checked_sub(last_checks_at)
|
|
||||||
.expect("Attempt to compact with LSN going backwards");
|
|
||||||
let min_distance = self.get_image_layer_creation_check_threshold() as u64
|
|
||||||
* self.get_checkpoint_distance();
|
|
||||||
|
|
||||||
// Skip the expensive delta layer counting if this timeline has not ingested sufficient
|
|
||||||
// WAL since the last check.
|
|
||||||
distance.0 >= min_distance
|
|
||||||
};
|
|
||||||
|
|
||||||
if check_for_image_layers {
|
|
||||||
self.last_image_layer_creation_check_at.store(lsn);
|
|
||||||
}
|
|
||||||
|
|
||||||
for partition in partitioning.parts.iter() {
|
for partition in partitioning.parts.iter() {
|
||||||
let img_range = start..partition.ranges.last().unwrap().end;
|
let img_range = start..partition.ranges.last().unwrap().end;
|
||||||
@@ -4483,6 +4574,22 @@ impl Timeline {
|
|||||||
start = img_range.end;
|
start = img_range.end;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
} else if let ImageLayerCreationMode::Force = mode {
|
||||||
|
// When forced to create image layers, we might try and create them where they already
|
||||||
|
// exist. This mode is only used in tests/debug.
|
||||||
|
let layers = self.layers.read().await;
|
||||||
|
if layers.contains_key(&PersistentLayerKey {
|
||||||
|
key_range: img_range.clone(),
|
||||||
|
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
|
||||||
|
is_delta: false,
|
||||||
|
}) {
|
||||||
|
tracing::info!(
|
||||||
|
"Skipping image layer at {lsn} {}..{}, already exists",
|
||||||
|
img_range.start,
|
||||||
|
img_range.end
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let image_layer_writer = ImageLayerWriter::new(
|
let image_layer_writer = ImageLayerWriter::new(
|
||||||
@@ -4613,7 +4720,7 @@ impl Timeline {
     /// Requires a timeline that:
     /// - has an ancestor to detach from
     /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
     ///   a technical requirement
     ///
     /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
     /// polled again until completion.
@@ -4711,6 +4818,42 @@ impl DurationRecorder {
     }
 }
 
+/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the
+/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore,
+/// the layer descriptor requires the user to provide the ranges, which should cover all
+/// keys specified in the `data` field.
+#[cfg(test)]
+pub struct DeltaLayerTestDesc {
+    pub lsn_range: Range<Lsn>,
+    pub key_range: Range<Key>,
+    pub data: Vec<(Key, Lsn, Value)>,
+}
+
+#[cfg(test)]
+impl DeltaLayerTestDesc {
+    #[allow(dead_code)]
+    pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
+        Self {
+            lsn_range,
+            key_range,
+            data,
+        }
+    }
+
+    pub fn new_with_inferred_key_range(
+        lsn_range: Range<Lsn>,
+        data: Vec<(Key, Lsn, Value)>,
+    ) -> Self {
+        let key_min = data.iter().map(|(key, _, _)| key).min().unwrap();
+        let key_max = data.iter().map(|(key, _, _)| key).max().unwrap();
+        Self {
+            key_range: (*key_min)..(key_max.next()),
+            lsn_range,
+            data,
+        }
+    }
+}
+
 impl Timeline {
     async fn finish_compact_batch(
         self: &Arc<Self>,
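A minimal usage sketch for the descriptor above, assuming the usual test-only imports from the pageserver crate; the LSN literals are made up for illustration, and `data` must be non-empty for the inferred-range constructor.

// Sketch only: imports and literal LSNs are assumptions; `Value` is the
// pageserver's WAL value type (import path omitted here).
use std::ops::Range;

use pageserver_api::key::Key;
use utils::lsn::Lsn;

fn example_desc(data: Vec<(Key, Lsn, Value)>) -> DeltaLayerTestDesc {
    // The provided ranges must cover every (key, lsn) pair in `data`;
    // here the key range is inferred from the (non-empty) data itself.
    let lsn_range: Range<Lsn> = Lsn(0x10)..Lsn(0x20);
    DeltaLayerTestDesc::new_with_inferred_key_range(lsn_range, data)
}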
@@ -5511,37 +5654,65 @@ impl Timeline {
     #[cfg(test)]
     pub(super) async fn force_create_delta_layer(
         self: &Arc<Timeline>,
-        mut deltas: Vec<(Key, Lsn, Value)>,
+        mut deltas: DeltaLayerTestDesc,
         check_start_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let last_record_lsn = self.get_last_record_lsn();
-        deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
-        let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
-        let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
-        let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
-        let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
+        deltas
+            .data
+            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+        assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start);
+        assert!(deltas.data.last().unwrap().0 < deltas.key_range.end);
+        for (_, lsn, _) in &deltas.data {
+            assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end);
+        }
         assert!(
-            max_lsn <= last_record_lsn,
-            "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
+            deltas.lsn_range.end <= last_record_lsn,
+            "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
+            deltas.lsn_range.end,
+            last_record_lsn
         );
-        let end_lsn = Lsn(max_lsn.0 + 1);
         if let Some(check_start_lsn) = check_start_lsn {
-            assert!(min_lsn >= check_start_lsn);
+            assert!(deltas.lsn_range.start >= check_start_lsn);
         }
+        // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
+        // layers of the same start/end LSN, and so should the force inserted layer
+        {
+            /// Checks if a overlaps with b, assume a/b = [start, end).
+            pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+
+            let guard = self.layers.read().await;
+            for layer in guard.layer_map().iter_historic_layers() {
+                if layer.is_delta()
+                    && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
+                    && layer.lsn_range != deltas.lsn_range
+                {
+                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
+                    panic!(
+                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
+                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
+                    );
+                }
+            }
+        }
         let mut delta_layer_writer = DeltaLayerWriter::new(
             self.conf,
             self.timeline_id,
             self.tenant_shard_id,
-            min_key,
-            min_lsn..end_lsn,
+            deltas.key_range.start,
+            deltas.lsn_range,
             ctx,
         )
         .await?;
-        for (key, lsn, val) in deltas {
+        for (key, lsn, val) in deltas.data {
             delta_layer_writer.put_value(key, lsn, val, ctx).await?;
         }
-        let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?;
+        let delta_layer = delta_layer_writer
+            .finish(deltas.key_range.end, self, ctx)
+            .await?;
 
         {
             let mut guard = self.layers.write().await;
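The helper above treats LSN ranges as half-open intervals [start, end). A small standalone sketch of the same predicate, on plain integer ranges, shows that adjacency does not count as overlap while identical ranges do (which the check above explicitly permits).

use std::ops::Range;

/// Same overlap predicate as in the hunk above, on half-open ranges [start, end).
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

fn main() {
    // Touching end-to-start is NOT an overlap for half-open ranges...
    assert!(!overlaps_with(&(0..10), &(10..20)));
    // ...but sharing any interior point is.
    assert!(overlaps_with(&(0..10), &(9..20)));
    // Identical ranges overlap, which the delta-layer check allows.
    assert!(overlaps_with(&(0..10), &(0..10)));
}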
@@ -182,13 +182,15 @@ async fn remove_timeline_from_tenant(
 /// 5. Delete index part
 /// 6. Delete meta, timeline directory
 /// 7. Delete mark file
+///
 /// It is resumable from any step in case a crash/restart occurs.
 /// There are three entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
 ///    and we possibly neeed to continue deletion of remote files.
 /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
 ///    index but still have local metadata, timeline directory and delete mark.
+///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
 pub enum DeleteTimelineFlow {
@@ -272,6 +274,7 @@ impl DeleteTimelineFlow {
             TimelineResources {
                 remote_client,
                 timeline_get_throttle: tenant.timeline_get_throttle.clone(),
+                l0_flush_global_state: tenant.l0_flush_global_state.clone(),
             },
             // Important. We dont pass ancestor above because it can be missing.
             // Thus we need to skip the validation here.
@@ -339,6 +339,10 @@ impl LayerManager {
         self.layer_fmgr.contains(layer)
     }
 
+    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.layer_fmgr.contains_key(key)
+    }
+
     pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
         self.layer_fmgr.0.keys().cloned().collect_vec()
     }
@@ -363,6 +367,10 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
         .clone()
     }
 
+    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
+        self.0.contains_key(key)
+    }
+
     pub(crate) fn insert(&mut self, layer: T) {
         let present = self.0.insert(layer.layer_desc().key(), layer.clone());
         if present.is_some() && cfg!(debug_assertions) {
@@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
 /// Calculation consists of two stages:
 ///
 /// 1. Initial size calculation. That might take a long time, because it requires
 ///    reading all layers containing relation sizes at `initial_part_end`.
 ///
 /// 2. Collecting an incremental part and adding that to the initial size.
 ///    Increments are appended on walreceiver writing new timeline data,
 ///    which result in increase or decrease of the logical size.
 pub(super) struct LogicalSize {
     /// Size, potentially slow to compute. Calculating this might require reading multiple
     /// layers, and even ancestor's layers.
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize {
     /// Size shouldn't ever be negative, but this is signed for two reasons:
     ///
     /// 1. If we initialized the "baseline" size lazily, while we already
     ///    process incoming WAL, the incoming WAL records could decrement the
     ///    variable and temporarily make it negative. (This is just future-proofing;
     ///    the initialization is currently not done lazily.)
     ///
     /// 2. If there is a bug and we e.g. forget to increment it in some cases
     ///    when size grows, but remember to decrement it when it shrinks again, the
     ///    variable could go negative. In that case, it seems better to at least
     ///    try to keep tracking it, rather than clamp or overflow it. Note that
     ///    get_current_logical_size() will clamp the returned value to zero if it's
     ///    negative, and log an error. Could set it permanently to zero or some
     ///    special value to indicate "broken" instead, but this will do for now.
     ///
     /// Note that we also expose a copy of this value as a prometheus metric,
     /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
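The doc comment above explains why the incremental part is kept signed and clamped to zero on read. Below is a minimal sketch of that pattern with AtomicI64; the type and method names are hypothetical and are not the pageserver's actual LogicalSize API.

use std::sync::atomic::{AtomicI64, Ordering};

/// Sketch only: a signed incremental size with clamp-on-read.
struct IncrementalSize {
    delta: AtomicI64,
}

impl IncrementalSize {
    fn record(&self, change: i64) {
        // WAL ingestion may grow or shrink the logical size.
        self.delta.fetch_add(change, Ordering::Relaxed);
    }

    fn current(&self) -> u64 {
        let v = self.delta.load(Ordering::Relaxed);
        if v < 0 {
            // A negative value indicates a bookkeeping bug or a lazily
            // initialized baseline; report zero instead of underflowing.
            0
        } else {
            v as u64
        }
    }
}

fn main() {
    let s = IncrementalSize { delta: AtomicI64::new(0) };
    s.record(8192);
    s.record(-16384);
    assert_eq!(s.current(), 0); // clamped, not wrapped
}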
@@ -2,13 +2,13 @@
 //! To do so, a current implementation needs to do the following:
 //!
 //! * acknowledge the timelines that it needs to stream WAL into.
 //!   Pageserver is able to dynamically (un)load tenants on attach and detach,
 //!   hence WAL receiver needs to react on such events.
 //!
 //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
 //!   For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
 //!   The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
 //!   Without this data, no WAL streaming is possible currently.
 //!
 //! Only one active WAL streaming connection is allowed at a time.
 //! The connection is supposed to be updated periodically, based on safekeeper timeline data.
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
     context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
+    metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
     task_mgr::TaskKind,
     task_mgr::WALRECEIVER_RUNTIME,
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection(
         .instrument(tracing::info_span!("poller")),
     );
 
-    // Immediately increment the gauge, then create a job to decrement it on task exit.
-    // One of the pros of `defer!` is that this will *most probably*
-    // get called, even in presence of panics.
-    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
-    gauge.inc();
-    scopeguard::defer! {
-        gauge.dec();
-    }
+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["wal_receiver"])
+        .guard();
 
     let identify = identify_system(&replication_client).await?;
     info!("{identify:?}");
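The hunk above swaps manual inc()/scopeguard bookkeeping for a guard object returned by the metric. Below is a minimal sketch of the underlying RAII idea with a hypothetical gauge type; the real LIVE_CONNECTIONS metric lives in the pageserver's metrics module and is not reproduced here.

use std::sync::atomic::{AtomicI64, Ordering};

/// Sketch only: a gauge whose guard increments on creation and decrements on
/// drop, so the decrement also runs on early return or unwind.
struct Gauge {
    value: AtomicI64,
}

struct GaugeGuard<'a> {
    gauge: &'a Gauge,
}

impl Gauge {
    fn guard(&self) -> GaugeGuard<'_> {
        self.value.fetch_add(1, Ordering::Relaxed);
        GaugeGuard { gauge: self }
    }
}

impl Drop for GaugeGuard<'_> {
    fn drop(&mut self) {
        self.gauge.value.fetch_sub(1, Ordering::Relaxed);
    }
}

fn main() {
    let live = Gauge { value: AtomicI64::new(0) };
    {
        let _g = live.guard();
        assert_eq!(live.value.load(Ordering::Relaxed), 1);
    } // guard dropped here
    assert_eq!(live.value.load(Ordering::Relaxed), 0);
}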
@@ -20,11 +20,13 @@ use std::num::NonZeroUsize;
 
 use bytes::BytesMut;
 use pageserver_api::key::Key;
+use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;
 
 use crate::context::RequestContext;
+use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
 use crate::virtual_file::VirtualFile;
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -68,7 +70,7 @@ impl VectoredRead {
     }
 }
 
-#[derive(Eq, PartialEq)]
+#[derive(Eq, PartialEq, Debug)]
 pub(crate) enum VectoredReadExtended {
     Yes,
     No,
@@ -91,7 +93,7 @@ impl VectoredReadBuilder {
         start_offset: u64,
         end_offset: u64,
         meta: BlobMeta,
-        max_read_size: Option<usize>,
+        max_read_size: usize,
     ) -> Self {
         let mut blobs_at = VecMap::default();
         blobs_at
@@ -102,10 +104,9 @@ impl VectoredReadBuilder {
             start: start_offset,
             end: end_offset,
             blobs_at,
-            max_read_size,
+            max_read_size: Some(max_read_size),
         }
     }
 
     /// Attempt to extend the current read with a new blob if the start
     /// offset matches with the current end of the vectored read
     /// and the resuting size is below the max read size
@@ -164,7 +165,7 @@ pub struct VectoredReadPlanner {
     // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
     prev: Option<(Key, Lsn, u64, BlobFlag)>,
 
-    max_read_size: Option<usize>,
+    max_read_size: usize,
 }
 
 impl VectoredReadPlanner {
@@ -172,20 +173,7 @@ impl VectoredReadPlanner {
         Self {
             blobs: BTreeMap::new(),
             prev: None,
-            max_read_size: Some(max_read_size),
-        }
-    }
-
-    /// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`],
-    /// it uses the vectored read planner to avoid duplicated logic on handling blob start/end, while expecting the vectored
-    /// read planner to give a single read to a continuous range of bytes in the image layer. Therefore, it does not need the
-    /// code path to split reads into chunks of `max_read_size`, and controls the read size itself.
-    #[cfg(test)]
-    pub(crate) fn new_caller_controlled_max_limit() -> Self {
-        Self {
-            blobs: BTreeMap::new(),
-            prev: None,
-            max_read_size: None,
+            max_read_size,
         }
     }
 
@@ -203,9 +191,9 @@ impl VectoredReadPlanner {
     ///
     /// The `flag` argument has two interesting values:
     /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs.
     ///   This is used for WAL records that `will_init`.
     /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
     ///   if the blob is cached.
     pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
         // Implementation note: internally lag behind by one blob such that
         // we have a start and end offset when initialising [`VectoredRead`]
@@ -315,7 +303,7 @@ impl<'a> VectoredBlobReader<'a> {
             read.size(),
             buf.capacity()
         );
-        let buf = self
+        let mut buf = self
             .file
             .read_exact_at(buf.slice(0..read.size()), read.start, ctx)
             .await?
@@ -337,38 +325,68 @@ impl<'a> VectoredBlobReader<'a> {
                 .chain(std::iter::once(None)),
         );
 
+        // Some scratch space, put here for reusing the allocation
+        let mut decompressed_vec = Vec::new();
+
         for ((offset, meta), next) in pairs {
             let offset_in_buf = offset - start_offset;
             let first_len_byte = buf[offset_in_buf as usize];
 
-            // Each blob is prefixed by a header containing it's size.
+            // Each blob is prefixed by a header containing its size and compression information.
             // Extract the size and skip that header to find the start of the data.
             // The size can be 1 or 4 bytes. The most significant bit is 0 in the
             // 1 byte case and 1 in the 4 byte case.
-            let (size_length, blob_size) = if first_len_byte < 0x80 {
-                (1, first_len_byte as u64)
+            let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 {
+                (1, first_len_byte as u64, BYTE_UNCOMPRESSED)
             } else {
                 let mut blob_size_buf = [0u8; 4];
                 let offset_in_buf = offset_in_buf as usize;
 
                 blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
-                blob_size_buf[0] &= 0x7f;
-                (4, u32::from_be_bytes(blob_size_buf) as u64)
+                blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
+
+                let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
+                (
+                    4,
+                    u32::from_be_bytes(blob_size_buf) as u64,
+                    compression_bits,
+                )
             };
 
-            let start = offset_in_buf + size_length;
-            let end = match next {
+            let start_raw = offset_in_buf + size_length;
+            let end_raw = match next {
                 Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
-                None => start + blob_size,
+                None => start_raw + blob_size,
             };
-
-            assert_eq!(end - start, blob_size);
+            assert_eq!(end_raw - start_raw, blob_size);
+            let (start, end);
+            if compression_bits == BYTE_UNCOMPRESSED {
+                start = start_raw as usize;
+                end = end_raw as usize;
+            } else if compression_bits == BYTE_ZSTD {
+                let mut decoder =
+                    async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
+                decoder
+                    .write_all(&buf[start_raw as usize..end_raw as usize])
+                    .await?;
+                decoder.flush().await?;
+                start = buf.len();
+                buf.extend_from_slice(&decompressed_vec);
+                end = buf.len();
+                decompressed_vec.clear();
+            } else {
+                let error = std::io::Error::new(
+                    std::io::ErrorKind::InvalidData,
+                    format!("invalid compression byte {compression_bits:x}"),
+                );
+                return Err(error);
+            }
 
             metas.push(VectoredBlob {
-                start: start as usize,
-                end: end as usize,
+                start,
+                end,
                 meta: *meta,
-            })
+            });
         }
 
         Ok(VectoredBlobsBuf { buf, blobs: metas })
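For reference, a standalone sketch of the length-header decoding used above. The split between the 1-byte and 4-byte forms follows the comment in the hunk; the exact constant values below are assumptions for illustration, the real ones live in the pageserver's blob_io module.

/// Sketch only: the constant values here are assumed, not quoted from the repo.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; // assumed mask for the header flag bits
const BYTE_UNCOMPRESSED: u8 = 0x80;        // assumed "4-byte length, no compression"
const BYTE_ZSTD: u8 = 0x90;                // assumed "4-byte length, zstd"

/// Returns (header length in bytes, blob length, compression bits).
fn parse_header(buf: &[u8]) -> (usize, u64, u8) {
    let first = buf[0];
    if first < 0x80 {
        // Short form: one byte, the length is the byte itself, never compressed.
        (1, first as u64, BYTE_UNCOMPRESSED)
    } else {
        // Long form: 4 bytes big-endian; the top bits of the first byte carry flags.
        let len = [first & !LEN_COMPRESSION_BIT_MASK, buf[1], buf[2], buf[3]];
        (4, u32::from_be_bytes(len) as u64, first & LEN_COMPRESSION_BIT_MASK)
    }
}

fn main() {
    assert_eq!(parse_header(&[0x05]), (1, 5, BYTE_UNCOMPRESSED));
    assert_eq!(parse_header(&[0x80, 0x00, 0x01, 0x00]), (4, 256, BYTE_UNCOMPRESSED));
    assert_eq!(parse_header(&[0x90, 0x00, 0x01, 0x00]).2, BYTE_ZSTD);
}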
@@ -376,17 +394,18 @@ impl<'a> VectoredBlobReader<'a> {
 }
 
 /// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
-/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and
-/// max_cnt constraints. Underlying it uses [`VectoredReadPlanner`].
+/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
+/// max_cnt constraints.
 #[cfg(test)]
 pub struct StreamingVectoredReadPlanner {
-    planner: VectoredReadPlanner,
-    /// Max read size per batch
+    read_builder: Option<VectoredReadBuilder>,
+    // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
+    prev: Option<(Key, Lsn, u64)>,
+    /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
+    /// we will produce a single batch instead of split them.
     max_read_size: u64,
     /// Max item count per batch
     max_cnt: usize,
-    /// The first offset of this batch
-    this_batch_first_offset: Option<u64>,
     /// Size of the current batch
     cnt: usize,
 }
@@ -397,67 +416,100 @@ impl StreamingVectoredReadPlanner {
         assert!(max_cnt > 0);
         assert!(max_read_size > 0);
         Self {
-            // We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call.
-            // Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability,
-            // to avoid splitting into two I/Os.
-            planner: VectoredReadPlanner::new_caller_controlled_max_limit(),
+            read_builder: None,
+            prev: None,
             max_cnt,
             max_read_size,
-            this_batch_first_offset: None,
             cnt: 0,
         }
     }
 
-    fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead {
-        let planner = std::mem::replace(
-            &mut self.planner,
-            VectoredReadPlanner::new_caller_controlled_max_limit(),
-        );
-        self.this_batch_first_offset = Some(this_batch_first_offset);
-        self.cnt = 1;
-        let mut batch = planner.finish();
-        assert_eq!(batch.len(), 1, "should have exactly one read batch");
-        batch.pop().unwrap()
+    pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option<VectoredRead> {
+        // Implementation note: internally lag behind by one blob such that
+        // we have a start and end offset when initialising [`VectoredRead`]
+        let (prev_key, prev_lsn, prev_offset) = match self.prev {
+            None => {
+                self.prev = Some((key, lsn, offset));
+                return None;
+            }
+            Some(prev) => prev,
+        };
+
+        let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false);
+
+        self.prev = Some((key, lsn, offset));
+
+        res
     }
 
-    pub fn handle(
+    pub fn handle_range_end(&mut self, offset: u64) -> Option<VectoredRead> {
+        let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev {
+            self.add_blob(prev_key, prev_lsn, prev_offset, offset, true)
+        } else {
+            None
+        };
+
+        self.prev = None;
+
+        res
+    }
+
+    fn add_blob(
         &mut self,
         key: Key,
         lsn: Lsn,
-        offset: u64,
-        flag: BlobFlag,
+        start_offset: u64,
+        end_offset: u64,
+        is_last_blob_in_read: bool,
     ) -> Option<VectoredRead> {
-        if let Some(begin_offset) = self.this_batch_first_offset {
-            // Each batch will have at least one item b/c `self.this_batch_first_offset` is set
-            // after one item gets processed
-            if offset - begin_offset > self.max_read_size {
-                self.planner.handle_range_end(offset); // End the current batch with the offset
-                let batch = self.emit(offset); // Produce a batch
-                self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
-                return Some(batch);
-            }
-        } else {
-            self.this_batch_first_offset = Some(offset)
-        }
-        if self.cnt >= self.max_cnt {
-            self.planner.handle_range_end(offset); // End the current batch with the offset
-            let batch = self.emit(offset); // Produce a batch
-            self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
-            return Some(batch);
-        }
-        self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch
-        self.cnt += 1;
-        None
-    }
-
-    pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead {
-        self.planner.handle_range_end(offset);
-        self.emit(offset)
+        match &mut self.read_builder {
+            Some(read_builder) => {
+                let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
+                assert_eq!(extended, VectoredReadExtended::Yes);
+            }
+            None => {
+                self.read_builder = {
+                    let mut blobs_at = VecMap::default();
+                    blobs_at
+                        .append(start_offset, BlobMeta { key, lsn })
+                        .expect("First insertion always succeeds");
+
+                    Some(VectoredReadBuilder {
+                        start: start_offset,
+                        end: end_offset,
+                        blobs_at,
+                        max_read_size: None,
+                    })
+                };
+            }
+        }
+        let read_builder = self.read_builder.as_mut().unwrap();
+        self.cnt += 1;
+        if is_last_blob_in_read
+            || read_builder.size() >= self.max_read_size as usize
+            || self.cnt >= self.max_cnt
+        {
+            let prev_read_builder = self.read_builder.take();
+            self.cnt = 0;
+
+            // `current_read_builder` is None in the first iteration
+            if let Some(read_builder) = prev_read_builder {
+                return Some(read_builder.build());
+            }
+        }
+        None
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use anyhow::Error;
+
+    use crate::context::DownloadBehavior;
+    use crate::page_cache::PAGE_SZ;
+    use crate::task_mgr::TaskKind;
+
+    use super::super::blob_io::tests::{random_array, write_maybe_compressed};
     use super::*;
 
     fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
@@ -509,8 +561,11 @@ mod tests {
         planner.handle_range_end(652 * 1024);
 
         let reads = planner.finish();
 
         assert_eq!(reads.len(), 6);
 
+        // TODO: could remove zero reads to produce 5 reads here
+
         for (idx, read) in reads.iter().enumerate() {
             validate_read(read, ranges[idx]);
         }
@@ -548,4 +603,187 @@ mod tests {
             validate_read(read, ranges[idx]);
         }
     }
+
+    #[test]
+    fn streaming_planner_max_read_size_test() {
+        let max_read_size = 128 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (key, lsn, 0, BlobFlag::None),
+            (key, lsn, 32 * 1024, BlobFlag::None),
+            (key, lsn, 96 * 1024, BlobFlag::None),
+            (key, lsn, 128 * 1024, BlobFlag::None),
+            (key, lsn, 198 * 1024, BlobFlag::None),
+            (key, lsn, 268 * 1024, BlobFlag::None),
+            (key, lsn, 396 * 1024, BlobFlag::None),
+            (key, lsn, 652 * 1024, BlobFlag::None),
+        ];
+
+        let ranges = [
+            &blob_descriptions[0..3],
+            &blob_descriptions[3..5],
+            &blob_descriptions[5..6],
+            &blob_descriptions[6..7],
+            &blob_descriptions[7..],
+        ];
+
+        let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000);
+        let mut reads = Vec::new();
+        for (key, lsn, offset, _) in blob_descriptions.clone() {
+            reads.extend(planner.handle(key, lsn, offset));
+        }
+        reads.extend(planner.handle_range_end(652 * 1024));
+
+        assert_eq!(reads.len(), ranges.len());
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
+    #[test]
+    fn streaming_planner_max_cnt_test() {
+        let max_read_size = 1024 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (key, lsn, 0, BlobFlag::None),
+            (key, lsn, 32 * 1024, BlobFlag::None),
+            (key, lsn, 96 * 1024, BlobFlag::None),
+            (key, lsn, 128 * 1024, BlobFlag::None),
+            (key, lsn, 198 * 1024, BlobFlag::None),
+            (key, lsn, 268 * 1024, BlobFlag::None),
+            (key, lsn, 396 * 1024, BlobFlag::None),
+            (key, lsn, 652 * 1024, BlobFlag::None),
+        ];
+
+        let ranges = [
+            &blob_descriptions[0..2],
+            &blob_descriptions[2..4],
+            &blob_descriptions[4..6],
+            &blob_descriptions[6..],
+        ];
+
+        let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
+        let mut reads = Vec::new();
+        for (key, lsn, offset, _) in blob_descriptions.clone() {
+            reads.extend(planner.handle(key, lsn, offset));
+        }
+        reads.extend(planner.handle_range_end(652 * 1024));
+
+        assert_eq!(reads.len(), ranges.len());
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
+    #[test]
+    fn streaming_planner_edge_test() {
+        let max_read_size = 1024 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert!(reads.is_empty());
+        }
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle(key, lsn, 0));
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert_eq!(reads.len(), 1);
+            validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
+        }
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle(key, lsn, 0));
+            reads.extend(planner.handle(key, lsn, 128 * 1024));
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert_eq!(reads.len(), 2);
+            validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
+            validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]);
+        }
+        {
+            let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
+            let mut reads = Vec::new();
+            reads.extend(planner.handle(key, lsn, 0));
+            reads.extend(planner.handle(key, lsn, 128 * 1024));
+            reads.extend(planner.handle_range_end(652 * 1024));
+            assert_eq!(reads.len(), 1);
+            validate_read(
+                &reads[0],
+                &[
+                    (key, lsn, 0, BlobFlag::None),
+                    (key, lsn, 128 * 1024, BlobFlag::None),
+                ],
+            );
+        }
+    }
+
+    async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+        let (_temp_dir, pathbuf, offsets) =
+            write_maybe_compressed::<true>(blobs, compression, &ctx).await?;
+
+        let file = VirtualFile::open(&pathbuf, &ctx).await?;
+        let file_len = std::fs::metadata(&pathbuf)?.len();
+
+        // Multiply by two (compressed data might need more space), and add a few bytes for the header
+        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
+        let mut buf = BytesMut::with_capacity(reserved_bytes);
+
+        let vectored_blob_reader = VectoredBlobReader::new(&file);
+        let meta = BlobMeta {
+            key: Key::MIN,
+            lsn: Lsn(0),
+        };
+
+        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
+            let end = offsets.get(idx + 1).unwrap_or(&file_len);
+            if idx + 1 == offsets.len() {
+                continue;
+            }
+            let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
+            let read = read_builder.build();
+            let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
+            assert_eq!(result.blobs.len(), 1);
+            let read_blob = &result.blobs[0];
+            let read_buf = &result.buf[read_blob.start..read_blob.end];
+            assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
+            buf = result.buf;
+        }
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_really_big_array() -> Result<(), Error> {
+        let blobs = &[
+            b"test".to_vec(),
+            random_array(10 * PAGE_SZ),
+            b"hello".to_vec(),
+            random_array(66 * PAGE_SZ),
+            vec![0xf3; 24 * PAGE_SZ],
+            b"foobar".to_vec(),
+        ];
+        round_trip_test_compressed(blobs, false).await?;
+        round_trip_test_compressed(blobs, true).await?;
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_arrays_inc() -> Result<(), Error> {
+        let blobs = (0..PAGE_SZ / 8)
+            .map(|v| random_array(v * 16))
+            .collect::<Vec<_>>();
+        round_trip_test_compressed(&blobs, false).await?;
+        round_trip_test_compressed(&blobs, true).await?;
+        Ok(())
+    }
 }
@@ -1,36 +0,0 @@
-use bytes::Bytes;
-use camino::Utf8PathBuf;
-use std::{
-    fs::{create_dir_all, File},
-    io::{BufWriter, Write},
-};
-
-pub struct Tracer {
-    writer: BufWriter<File>,
-}
-
-impl Drop for Tracer {
-    fn drop(&mut self) {
-        self.flush()
-    }
-}
-
-impl Tracer {
-    pub fn new(path: Utf8PathBuf) -> Self {
-        let parent = path.parent().expect("failed to parse parent path");
-        create_dir_all(parent).expect("failed to create trace dir");
-
-        let file = File::create(path).expect("failed to create trace file");
-        Tracer {
-            writer: BufWriter::new(file),
-        }
-    }
-
-    pub fn trace(&mut self, msg: &Bytes) {
-        self.writer.write_all(msg).expect("failed to write trace");
-    }
-
-    pub fn flush(&mut self) {
-        self.writer.flush().expect("failed to flush trace file");
-    }
-}
@@ -33,6 +33,7 @@ pub struct BufferedWriter<B, W> {
     /// invariant: always remains Some(buf) except
     /// - while IO is ongoing => goes back to Some() once the IO completed successfully
    /// - after an IO error => stays `None` forever
+    ///
     /// In these exceptional cases, it's `None`.
     buf: Option<B>,
 }
@@ -343,7 +343,33 @@ impl WalIngest {
                     xlog_checkpoint.oldestActiveXid,
                     self.checkpoint.oldestActiveXid
                 );
-                self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+
+                // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
+                // because at shutdown, all in-progress transactions will implicitly
+                // end. Postgres startup code knows that, and allows hot standby to start
+                // immediately from a shutdown checkpoint.
+                //
+                // In Neon, Postgres hot standby startup always behaves as if starting from
+                // an online checkpoint. It needs a valid `oldestActiveXid` value, so
+                // instead of overwriting self.checkpoint.oldestActiveXid with
+                // InvalidTransactionid from the checkpoint WAL record, update it to a
+                // proper value, knowing that there are no in-progress transactions at this
+                // point, except for prepared transactions.
+                //
+                // See also the neon code changes in the InitWalRecovery() function.
+                if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
+                    && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                {
+                    let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
+                    for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                        if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                            oldest_active_xid = xid;
+                        }
+                    }
+                    self.checkpoint.oldestActiveXid = oldest_active_xid;
+                } else {
+                    self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+                }
 
                 // Write a new checkpoint key-value pair on every checkpoint record, even
                 // if nothing really changed. Not strictly required, but it seems nice to
@@ -375,6 +401,7 @@ impl WalIngest {
                 if info == pg_constants::XLOG_RUNNING_XACTS {
                     let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
                     self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+                    self.checkpoint_modified = true;
                 }
             }
             pg_constants::RM_REPLORIGIN_ID => {
@@ -1277,13 +1304,10 @@ impl WalIngest {
                     xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db
                 );
 
-                // Here we treat oldestXid and oldestXidDB
-                // differently from postgres redo routines.
-                // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid
-                // until checkpoint happens and updates the value.
-                // Here we can use the most recent value.
-                // It's just an optimization, though and can be deleted.
-                // TODO Figure out if there will be any issues with replica.
+                // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is
+                // truncated, but a checkpoint record with the updated values isn't written until
+                // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
+                // so we keep the oldestXid and oldestXidDB up-to-date.
                 self.checkpoint.oldestXid = xlrec.oldest_xid;
                 self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
                 self.checkpoint_modified = true;
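The loop above picks the oldest prepared-transaction XID with `(xid.wrapping_sub(oldest) as i32) < 0`, the usual modulo-2^32 ordering trick for transaction IDs. A tiny standalone sketch on plain u32 values:

/// Sketch only: "is `a` older than `b`" under 32-bit wraparound, the same
/// comparison used in the hunk above.
fn xid_precedes(a: u32, b: u32) -> bool {
    (a.wrapping_sub(b) as i32) < 0
}

fn main() {
    // Ordinary case: 100 is older than 200.
    assert!(xid_precedes(100, 200));
    // Wraparound case: u32::MAX - 5 is older than 5 (which sits just past the wrap).
    assert!(xid_precedes(u32::MAX - 5, 5));
    assert!(!xid_precedes(5, u32::MAX - 5));
}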
@@ -6,6 +6,7 @@ OBJS = \
 	$(WIN32RES) \
 	extension_server.o \
 	file_cache.o \
+	hll.o \
 	libpagestore.o \
 	neon.o \
 	neon_utils.o \
@@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \
|
|||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
#include "pagestore_client.h"
|
#include "pagestore_client.h"
|
||||||
#include "common/hashfn.h"
|
#include "common/hashfn.h"
|
||||||
#include "lib/hyperloglog.h"
|
|
||||||
#include "pgstat.h"
|
#include "pgstat.h"
|
||||||
#include "postmaster/bgworker.h"
|
#include "postmaster/bgworker.h"
|
||||||
#include RELFILEINFO_HDR
|
#include RELFILEINFO_HDR
|
||||||
@@ -40,6 +39,8 @@
|
|||||||
#include "utils/dynahash.h"
|
#include "utils/dynahash.h"
|
||||||
#include "utils/guc.h"
|
#include "utils/guc.h"
|
||||||
|
|
||||||
|
#include "hll.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local file cache is used to temporary store relations pages in local file system.
|
* Local file cache is used to temporary store relations pages in local file system.
|
||||||
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
||||||
@@ -62,7 +63,6 @@
|
|||||||
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
||||||
#define MB ((uint64)1024*1024)
|
#define MB ((uint64)1024*1024)
|
||||||
|
|
||||||
#define HYPER_LOG_LOG_BIT_WIDTH 10
|
|
||||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||||
|
|
||||||
typedef struct FileCacheEntry
|
typedef struct FileCacheEntry
|
||||||
@@ -87,8 +87,7 @@ typedef struct FileCacheControl
|
|||||||
uint64 writes;
|
uint64 writes;
|
||||||
dlist_head lru; /* double linked list for LRU replacement
|
dlist_head lru; /* double linked list for LRU replacement
|
||||||
* algorithm */
|
* algorithm */
|
||||||
hyperLogLogState wss_estimation; /* estimation of wroking set size */
|
HyperLogLogState wss_estimation; /* estimation of working set size */
|
||||||
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
|
|
||||||
} FileCacheControl;
|
} FileCacheControl;
|
||||||
|
|
||||||
static HTAB *lfc_hash;
|
static HTAB *lfc_hash;
|
||||||
@@ -238,12 +237,7 @@ lfc_shmem_startup(void)
|
|||||||
dlist_init(&lfc_ctl->lru);
|
dlist_init(&lfc_ctl->lru);
|
||||||
|
|
||||||
/* Initialize hyper-log-log structure for estimating working set size */
|
/* Initialize hyper-log-log structure for estimating working set size */
|
||||||
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
|
initSHLL(&lfc_ctl->wss_estimation);
|
||||||
|
|
||||||
/* We need hashes in shared memory */
|
|
||||||
pfree(lfc_ctl->wss_estimation.hashesArr);
|
|
||||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
|
||||||
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
|
|
||||||
|
|
||||||
/* Recreate file cache on restart */
|
/* Recreate file cache on restart */
|
||||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||||
@@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
|
|
||||||
/* Approximate working set */
|
/* Approximate working set */
|
||||||
tag.blockNum = blkno;
|
tag.blockNum = blkno;
|
||||||
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
||||||
|
|
||||||
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
|
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
|
||||||
{
|
{
|
||||||
@@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
|||||||
SRF_RETURN_DONE(funcctx);
|
SRF_RETURN_DONE(funcctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
|
||||||
|
|
||||||
|
Datum
|
||||||
|
approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
if (lfc_size_limit != 0)
|
||||||
|
{
|
||||||
|
int32 dc;
|
||||||
|
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
|
||||||
|
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||||
|
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
|
||||||
|
LWLockRelease(lfc_lock);
|
||||||
|
PG_RETURN_INT32(dc);
|
||||||
|
}
|
||||||
|
PG_RETURN_NULL();
|
||||||
|
}
|
||||||
|
|
||||||
PG_FUNCTION_INFO_V1(approximate_working_set_size);
|
PG_FUNCTION_INFO_V1(approximate_working_set_size);
|
||||||
|
|
||||||
Datum
|
Datum
|
||||||
approximate_working_set_size(PG_FUNCTION_ARGS)
|
approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
int32 dc = -1;
|
|
||||||
if (lfc_size_limit != 0)
|
if (lfc_size_limit != 0)
|
||||||
{
|
{
|
||||||
|
int32 dc;
|
||||||
bool reset = PG_GETARG_BOOL(0);
|
bool reset = PG_GETARG_BOOL(0);
|
||||||
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
|
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
|
||||||
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
|
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
|
||||||
if (reset)
|
if (reset)
|
||||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
|
||||||
LWLockRelease(lfc_lock);
|
LWLockRelease(lfc_lock);
|
||||||
|
PG_RETURN_INT32(dc);
|
||||||
}
|
}
|
||||||
PG_RETURN_INT32(dc);
|
PG_RETURN_NULL();
|
||||||
}
|
}
|
||||||
|
|||||||
193
pgxn/neon/hll.c
Normal file
193
pgxn/neon/hll.c
Normal file
@@ -0,0 +1,193 @@
/*-------------------------------------------------------------------------
 *
 * hll.c
 *	  Sliding HyperLogLog cardinality estimator
 *
 * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
 *
 * Implements https://hal.science/hal-00465313/document
 *
 * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
 * suited to estimating the cardinality of very large sets; in particular, we
 * have not attempted to further optimize the implementation as described in
 * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
 * Engineering of a State of The Art Cardinality Estimation Algorithm".
 *
 * A sparse representation of HyperLogLog state is used, with fixed space
 * overhead.
 *
 * The copyright terms of Ohno's original version (the MIT license) follow.
 *
 * IDENTIFICATION
 *	  src/backend/lib/hyperloglog.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the 'Software'), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <math.h>

#include "postgres.h"
#include "funcapi.h"
#include "port/pg_bitutils.h"
#include "utils/timestamp.h"
#include "hll.h"


#define POW_2_32			(4294967296.0)
#define NEG_POW_2_32		(-4294967296.0)

#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)

/*
 * Worker for addSHLL().
 *
 * Calculates the position of the first set bit within the first b bits of x,
 * reading from most significant to least significant bit.
 *
 * Example (when considering the first 10 bits of x):
 *
 * rho(x = 0b1000000000) returns 1
 * rho(x = 0b0010000000) returns 3
 * rho(x = 0b0000000000) returns b + 1
 *
 * "The binary address determined by the first b bits of x"
 *
 * The return value "j" is used to index the bit pattern to watch.
 */
static inline uint8
rho(uint32 x, uint8 b)
{
	uint8		j = 1;

	if (x == 0)
		return b + 1;

	j = 32 - pg_leftmost_one_pos32(x);

	if (j > b)
		return b + 1;

	return j;
}

/*
 * Initialize HyperLogLog track state
 */
void
initSHLL(HyperLogLogState *cState)
{
	memset(cState->regs, 0, sizeof(cState->regs));
}

/*
 * Adds element to the estimator, from caller-supplied hash.
 *
 * It is critical that the hash value passed be an actual hash value, typically
 * generated using hash_any().  The algorithm relies on a specific bit-pattern
 * observable in conjunction with stochastic averaging.  There must be a
 * uniform distribution of bits in hash values for each distinct original value
 * observed.
 */
void
addSHLL(HyperLogLogState *cState, uint32 hash)
{
	uint8		count;
	uint32		index;
	size_t		i;
	size_t		j;

	TimestampTz now = GetCurrentTimestamp();

	/* Use the first "k" (registerWidth) bits as a zero based index */
	index = hash >> HLL_C_BITS;

	/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
	count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);

	cState->regs[index][count] = now;
}

static uint8
getMaximum(const TimestampTz *reg, TimestampTz since)
{
	uint8		max = 0;

	for (size_t i = 0; i < HLL_C_BITS + 1; i++)
	{
		if (reg[i] >= since)
		{
			max = i;
		}
	}

	return max;
}


/*
 * Estimates cardinality, based on elements added so far
 */
double
estimateSHLL(HyperLogLogState *cState, time_t duration)
{
	double		result;
	double		sum = 0.0;
	size_t		i;
	uint8		R[HLL_N_REGISTERS];
	/* 0 indicates an uninitialized timestamp, so to cover the whole range start with 1 */
	TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC;

	for (i = 0; i < HLL_N_REGISTERS; i++)
	{
		R[i] = getMaximum(cState->regs[i], since);
		sum += 1.0 / pow(2.0, R[i]);
	}

	/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
	result = ALPHA_MM / sum;

	if (result <= (5.0 / 2.0) * HLL_N_REGISTERS)
	{
		/* Small range correction */
		int			zero_count = 0;

		for (i = 0; i < HLL_N_REGISTERS; i++)
		{
			zero_count += R[i] == 0;
		}

		if (zero_count != 0)
			result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS /
										   zero_count);
	}
	else if (result > (1.0 / 30.0) * POW_2_32)
	{
		/* Large range correction */
		result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
	}

	return result;
}
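To make the intended call pattern concrete, here is a hedged usage sketch of the three functions above inside a PostgreSQL backend. It is not code from this diff: the wrapper names are invented, and hash_any() (from common/hashfn.h in recent PostgreSQL versions) stands in for whatever key hashing the caller actually uses.

/* Illustrative usage of the sliding-HLL API; assumes a PostgreSQL backend
 * environment.  Wrapper names are hypothetical. */
#include "postgres.h"
#include "common/hashfn.h"		/* hash_any() */
#include "utils/timestamp.h"
#include "hll.h"

static HyperLogLogState sketch;

static void
sketch_init(void)
{
	initSHLL(&sketch);
}

/* Record one observation of an arbitrary fixed-size key. */
static void
sketch_observe(const void *key, Size keylen)
{
	uint32		h = DatumGetUInt32(hash_any((const unsigned char *) key, keylen));

	addSHLL(&sketch, h);
}

/* Approximate number of distinct keys seen in the last hour. */
static double
sketch_distinct_last_hour(void)
{
	return estimateSHLL(&sketch, (time_t) 3600);
}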
pgxn/neon/hll.h (new file, 86 lines)
@@ -0,0 +1,86 @@
/*-------------------------------------------------------------------------
 *
 * hll.h
 *	  Sliding HyperLogLog cardinality estimator
 *
 * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
 *
 * Implements https://hal.science/hal-00465313/document
 *
 * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
 * suited to estimating the cardinality of very large sets; in particular, we
 * have not attempted to further optimize the implementation as described in
 * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
 * Engineering of a State of The Art Cardinality Estimation Algorithm".
 *
 * A sparse representation of HyperLogLog state is used, with fixed space
 * overhead.
 *
 * The copyright terms of Ohno's original version (the MIT license) follow.
 *
 * IDENTIFICATION
 *	  src/backend/lib/hyperloglog.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the 'Software'), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef HLL_H
#define HLL_H

#define HLL_BIT_WIDTH		10
#define HLL_C_BITS			(32 - HLL_BIT_WIDTH)
#define HLL_N_REGISTERS		(1 << HLL_BIT_WIDTH)

/*
 * HyperLogLog is an approximate technique for computing the number of distinct
 * entries in a set.  Importantly, it does this by using a fixed amount of
 * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
 * cardinality estimation algorithm" for more.
 *
 * Instead of a single counter for every bits register, we have a timestamp
 * for every valid number of bits we can encounter.  Every time we encounter
 * a certain number of bits, we update the timestamp in those registers to
 * the current timestamp.
 *
 * We can query the sketch's stored cardinality for the range of some timestamp
 * up to now: for each register, we return the highest bits bucket that has a
 * modified timestamp >= the query timestamp.  This value is the number of bits
 * for this register in the normal HLL calculation.
 *
 * The memory usage is 2^B * (C + 1) * sizeof(TimestampTz), or 184 kiB.
 * Usage could be halved if we decide to reduce the required time dimension
 * precision; 32 bits at second precision should be enough for statistics.
 * However, that is not yet implemented.
 */
typedef struct HyperLogLogState
{
	TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
} HyperLogLogState;

extern void initSHLL(HyperLogLogState *cState);
extern void addSHLL(HyperLogLogState *cState, uint32 hash);
extern double estimateSHLL(HyperLogLogState *cState, time_t duration);

#endif
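The 184 kiB figure in the comment above follows directly from the constants in this header: 2^10 registers, each holding HLL_C_BITS + 1 = 23 timestamps of 8 bytes. A standalone check of that arithmetic (illustrative only; TimestampTz is re-declared here so the snippet compiles outside the backend):

/* Worked size check for the register array above -- not part of the diff. */
#include <stdint.h>

typedef int64_t TimestampTz;	/* 8 bytes, as in the backend */

#define HLL_BIT_WIDTH	10
#define HLL_C_BITS		(32 - HLL_BIT_WIDTH)	/* 22 */
#define HLL_N_REGISTERS	(1 << HLL_BIT_WIDTH)	/* 1024 */

/* 1024 registers * 23 slots * 8 bytes = 188,416 bytes = 184 kiB */
_Static_assert(sizeof(TimestampTz[HLL_N_REGISTERS][HLL_C_BITS + 1]) == 184 * 1024,
			   "sliding-HLL register array is 184 kiB");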
@@ -427,12 +427,17 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		values[n_pgsql_params] = NULL;

 		shard->conn = PQconnectStartParams(keywords, values, 1);
-		if (!shard->conn)
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
 		{
-			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));
+			CLEANUP_AND_DISCONNECT(shard);
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+					 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+					 errdetail_internal("%s", msg)));
+			pfree(msg);
 			return false;
 		}

 		shard->state = PS_Connecting_Startup;
 		/* fallthrough */
 	}
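The change above replaces a NULL check with a PQstatus() check: PQconnectStartParams() returns NULL only when libpq cannot allocate a PGconn at all, while other early failures leave the connection object in the CONNECTION_BAD state, so both cases must be handled and the error text reported. A standalone sketch of that libpq pattern outside Neon (connection parameters here are placeholders):

/* Illustrative libpq error handling -- not Neon code. */
#include <stdio.h>
#include <libpq-fe.h>

int
main(void)
{
	const char *keywords[] = {"host", "port", NULL};
	const char *values[]   = {"localhost", "5432", NULL};
	PGconn	   *conn = PQconnectStartParams(keywords, values, 0);

	if (conn == NULL || PQstatus(conn) == CONNECTION_BAD)
	{
		fprintf(stderr, "could not start connection: %s",
				conn ? PQerrorMessage(conn) : "out of memory\n");
		if (conn)
			PQfinish(conn);
		return 1;
	}

	/* ... drive the connection to completion with PQconnectPoll() ... */
	PQfinish(conn);
	return 0;
}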
Some files were not shown because too many files have changed in this diff.