compress cache key

proxy: cache a compressed version of the node info
2026-03-11 20:30:37 +00:00 · 2024-06-28 09:12:18 +01:00 · 2024-06-28 09:04:54 +01:00
215 changed files with 3475 additions and 8985 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -114,8 +114,6 @@ runs:
        export PLATFORM=${PLATFORM:-github-actions-selfhosted}
        export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
        export DEFAULT_PG_VERSION=${PG_VERSION#v}
-        export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
-        export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
@@ -180,15 +178,7 @@ runs:

        # Wake up the cluster if we use remote neon instance
        if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
-          QUERIES=("SELECT version()")
-          if [[ "${PLATFORM}" = "neon"* ]]; then
-            QUERIES+=("SHOW neon.tenant_id")
-            QUERIES+=("SHOW neon.timeline_id")
-          fi
-
-          for q in "${QUERIES[@]}"; do
-            ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
-          done
+          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
        fi

        # Run the tests.
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -99,14 +99,7 @@ jobs:
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-        extra_params:
-          -m remote_cluster
-          --sparse-ordering
-          --timeout 5400
-          --ignore test_runner/performance/test_perf_olap.py
-          --ignore test_runner/performance/test_perf_pgvector_queries.py
-          --ignore test_runner/performance/test_logical_replication.py
-          --ignore test_runner/performance/test_physical_replication.py
+        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -132,69 +125,6 @@ jobs:
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

-  replication-tests:
-    env:
-      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
-      TEST_OUTPUT: /tmp/test_output
-      BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-staging"
-
-    runs-on: [ self-hosted, us-east-2, x64 ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
-      options: --init
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-
-    - name: Run benchmark
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_logical_replication.py
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 5400
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Run benchmark
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_physical_replication.py
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 5400
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Create Allure report
-      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report-generate
-
-    - name: Post to a Slack channel
-      if: ${{ github.event.schedule && failure() }}
-      uses: slackapi/slack-github-action@v1
-      with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
  generate-matrices:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
@@ -309,6 +239,11 @@ jobs:
        path: /tmp/neon/
        prefix: latest

+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
    - name: Create Neon Project
      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
      id: create-neon-project
@@ -347,6 +282,16 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

+        QUERIES=("SELECT version()")
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
+        fi
+
+        for q in "${QUERIES[@]}"; do
+          psql "${CONNSTR}" -c "${q}"  # quoted so the DSN reaches psql as one word, with no globbing
+        done
+
    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
      with:
@@ -432,13 +377,26 @@ jobs:
        path: /tmp/neon/
        prefix: latest

+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
    - name: Set up Connection String
      id: set-up-connstr
      run: |
        CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
-
+        
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

+        QUERIES=("SELECT version()")
+        QUERIES+=("SHOW neon.tenant_id")
+        QUERIES+=("SHOW neon.timeline_id")
+        
+        for q in "${QUERIES[@]}"; do
+          psql "${CONNSTR}" -c "${q}"  # quoted so the DSN reaches psql as one word, with no globbing
+        done
+
    - name: Benchmark pgvector hnsw indexing
      uses: ./.github/actions/run-python-test-set
      with:
@@ -459,12 +417,12 @@ jobs:
        test_selection: performance/test_perf_pgvector_queries.py
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600
+        extra_params: -m remote_cluster --timeout 21600 
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-
+    
    - name: Create Allure report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
@@ -519,6 +477,11 @@ jobs:
        path: /tmp/neon/
        prefix: latest

+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
    - name: Set up Connection String
      id: set-up-connstr
      run: |
@@ -540,6 +503,16 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

+        QUERIES=("SELECT version()")
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
+        fi
+
+        for q in "${QUERIES[@]}"; do
+          psql "${CONNSTR}" -c "${q}"  # quoted so the DSN reaches psql as one word, with no globbing
+        done
+
    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
      with:
@@ -607,6 +580,11 @@ jobs:
        path: /tmp/neon/
        prefix: latest

+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
    - name: Get Connstring Secret Name
      run: |
        case "${PLATFORM}" in
@@ -635,6 +613,16 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

+        QUERIES=("SELECT version()")
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
+        fi
+
+        for q in "${QUERIES[@]}"; do
+          psql "${CONNSTR}" -c "${q}"  # quoted so the DSN reaches psql as one word, with no globbing
+        done
+
    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
      with:
@@ -693,6 +681,11 @@ jobs:
        path: /tmp/neon/
        prefix: latest

+    - name: Add Postgres binaries to PATH
+      run: |
+        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
+        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
+
    - name: Set up Connection String
      id: set-up-connstr
      run: |
@@ -714,6 +707,16 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

+        QUERIES=("SELECT version()")
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
+        fi
+
+        for q in "${QUERIES[@]}"; do
+          psql "${CONNSTR}" -c "${q}"  # quoted so the DSN reaches psql as one word, with no globbing
+        done
+
    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
      with:
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -63,16 +63,14 @@ jobs:
          mkdir -p /tmp/.docker-custom
          echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV

-      - uses: docker/setup-buildx-action@v3
-        with:
-          cache-binary: false
+      - uses: docker/setup-buildx-action@v2

-      - uses: docker/login-action@v3
+      - uses: docker/login-action@v2
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/build-push-action@v6
+      - uses: docker/build-push-action@v4
        with:
          context: .
          provenance: false
@@ -84,7 +82,6 @@ jobs:
          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}

      - name: Remove custom docker config directory
-        if: always()
        run: |
          rm -rf /tmp/.docker-custom

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -30,7 +30,7 @@ jobs:
    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
    uses: ./.github/workflows/check-permissions.yml
    with:
-      github-event-name: ${{ github.event_name }}
+      github-event-name: ${{ github.event_name}}

  cancel-previous-e2e-tests:
    needs: [ check-permissions ]
@@ -335,8 +335,6 @@ jobs:

      - name: Run cargo build
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

      # Do install *before* running rust tests because they might recompile the
@@ -385,11 +383,6 @@ jobs:
        env:
          NEXTEST_RETRIES: 3
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
-          export LD_LIBRARY_PATH
-
          #nextest does not yet support running doctests
          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES

@@ -751,16 +744,14 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
-        with:
-          cache-binary: false
+      - uses: docker/setup-buildx-action@v2

      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/build-push-action@v6
+      - uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
@@ -831,12 +822,11 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2
        with:
-          cache-binary: false
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
-          buildkitd-config-inline: |
+          config-inline: |
            [worker.oci]
              max-parallelism = 1

@@ -852,7 +842,7 @@ jobs:
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

      - name: Build compute-node image
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
@@ -871,7 +861,7 @@ jobs:

      - name: Build neon extensions test image
        if: matrix.version == 'v16'
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
@@ -892,7 +882,7 @@ jobs:
      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
        if: matrix.version == 'v16'
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        with:
          target: compute-tools-image
          context: .
@@ -1336,7 +1326,6 @@ jobs:
        env:
          BUCKET: neon-github-public-dev
          PREFIX: artifacts/latest
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          # Update compatibility snapshot for the release
          for pg_version in v14 v15 v16; do
@@ -1350,7 +1339,7 @@ jobs:

          # Update Neon artifact for the release (reuse already uploaded artifact)
          for build_type in debug release; do
-            OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
+            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst

            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
@@ -1369,31 +1358,3 @@ jobs:
    with:
      from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
    secrets: inherit
-
-  # This job simplifies setting branch protection rules (in GitHub UI)
-  # by allowing to set only this job instead of listing many others.
-  # It also makes it easier to rename or parametrise jobs (using matrix)
-  # which requires changes in branch protection rules
-  #
-  # Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that.
-  #
-  # https://github.com/neondatabase/neon/settings/branch_protection_rules
-  conclusion:
-    if: always()
-    # Format `needs` differently to make the list more readable.
-    # Usually we do `needs: [...]`
-    needs:
-      - check-codestyle-python
-      - check-codestyle-rust
-      - regress-tests
-      - test-images
-    runs-on: ubuntu-22.04
-    steps:
-      # The list of possible results:
-      # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
-      - name: Fail the job if any of the dependencies do not succeed
-        run: exit 1
-        if: |
-          contains(needs.*.result, 'failure')
-          || contains(needs.*.result, 'cancelled')
-          || contains(needs.*.result, 'skipped')
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -232,19 +232,12 @@ jobs:

      - name: Run cargo build
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)

      - name: Run cargo test
        env:
          NEXTEST_RETRIES: 3
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
-          export LD_LIBRARY_PATH
-
          cargo nextest run $CARGO_FEATURES -j$(nproc)

          # Run separate tests for real S3
@@ -385,7 +378,7 @@ jobs:
        run: make walproposer-lib -j$(nproc)

      - name: Produce the build stats
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)
+        run: cargo build --all --release --timings -j$(nproc)

      - name: Upload the build stats
        id: upload-stats
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -1,155 +0,0 @@
-name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
-
-on:
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    #          ┌───────────── minute (0 - 59)
-    #          │ ┌───────────── hour (0 - 23)
-    #          │ │ ┌───────────── day of the month (1 - 31)
-    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
-    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:  '0 18 * * *' # Runs at 6 PM UTC every day
-  workflow_dispatch: # Allows manual triggering of the workflow
-    inputs:
-      commit_hash:
-        type: string
-        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
-        required: false
-        default: ''
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-concurrency:
-  group: ${{ github.workflow }}
-  cancel-in-progress: false
-
-jobs:
-  trigger_bench_on_ec2_machine_in_eu_central_1:
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: neondatabase/build-tools:pinned
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init
-    timeout-minutes: 360  # Set the timeout to 6 hours
-    env:
-      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
-      RUN_ID: ${{ github.run_id }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
-      AWS_DEFAULT_REGION : "eu-central-1"
-      AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
-    steps:
-    # we don't need the neon source code because we run everything remotely
-    # however we still need the local github actions to run the allure step below
-    - uses: actions/checkout@v4
-
-    - name: Show my own (github runner) external IP address - usefull for IP allowlisting
-      run: curl https://ifconfig.me
-
-    - name: Start EC2 instance and wait for the instance to boot up
-      run: |
-        aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
-        sleep 60 # sleep some time to allow cloudinit and our API server to start up
-
-    - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
-      run: |
-        public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
-        echo "Public IP of the EC2 instance: $public_ip"
-        echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
-
-    - name: Determine commit hash
-      env:
-        INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
-      run: |
-        if [ -z "$INPUT_COMMIT_HASH" ]; then
-          echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
-        else
-          echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
-        fi
-
-    - name: Start Bench with run_id   
-      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H 'Content-Type: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
-
-    - name: Poll Test Status
-      id: poll_step
-      run: |
-        status=""
-        while [[ "$status" != "failure" && "$status" != "success" ]]; do
-          response=$(curl -k -X 'GET' \
-          "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
-          -H 'accept: application/json' \
-          -H "Authorization: Bearer $API_KEY")
-          echo "Response: $response"
-          set +x
-          status=$(echo $response | jq -r '.status')
-          echo "Test status: $status"
-          if [[ "$status" == "failure" ]]; then
-            echo "Test failed"
-            exit 1 # Fail the job step if status is failure
-          elif [[ "$status" == "success" || "$status" == "null" ]]; then
-            break
-          elif [[ "$status" == "too_many_runs" ]]; then
-            echo "Too many runs already running"
-            echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
-            exit 1
-          fi
-
-          sleep 60 # Poll every 60 seconds
-        done
-
-    - name: Retrieve Test Logs
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        curl -k -X 'GET' \
-        "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
-        -H 'accept: application/gzip' \
-        -H "Authorization: Bearer $API_KEY" \
-        --output "test_log_${GITHUB_RUN_ID}.gz"
-    
-    - name: Unzip Test Log and Print it into this job's log
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        gzip -d "test_log_${GITHUB_RUN_ID}.gz"
-        cat "test_log_${GITHUB_RUN_ID}"
-
-    - name: Create Allure report
-      env:
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report-generate
-
-    - name: Post to a Slack channel
-      if: ${{ github.event.schedule && failure() }}
-      uses: slackapi/slack-github-action@v1
-      with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
-    - name: Cleanup Test Resources
-      if: always() 
-      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d ''
-
-    - name: Stop EC2 instance and wait for the instance to be stopped
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -1,115 +0,0 @@
-name: Test Postgres client libraries
-
-on:
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    #          ┌───────────── minute (0 - 59)
-    #          │ ┌───────────── hour (0 - 23)
-    #          │ │ ┌───────────── day of the month (1 - 31)
-    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
-    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:  '23 02 * * *' # run once a day, timezone is utc
-  pull_request:
-    paths:
-      - '.github/workflows/pg-clients.yml'
-      - 'test_runner/pg_clients/**'
-      - 'poetry.lock'
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref_name }}
-  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
-env:
-  DEFAULT_PG_VERSION: 16
-  PLATFORM: neon-captest-new
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-  AWS_DEFAULT_REGION: eu-central-1
-
-jobs:
-  check-permissions:
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
-    uses: ./.github/workflows/check-permissions.yml
-    with:
-      github-event-name: ${{ github.event_name }}
-
-  check-build-tools-image:
-    needs: [ check-permissions ]
-    uses: ./.github/workflows/check-build-tools-image.yml
-
-  build-build-tools-image:
-    needs: [ check-build-tools-image ]
-    uses: ./.github/workflows/build-build-tools-image.yml
-    with:
-      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
-    secrets: inherit
-
-  test-postgres-client-libs:
-    needs: [ build-build-tools-image ]
-    runs-on: ubuntu-22.04
-
-    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
-      credentials:
-        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      options: --init --user root
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-
-    - name: Create Neon Project
-      id: create-neon-project
-      uses: ./.github/actions/neon-project-create
-      with:
-        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
-
-    - name: Run tests
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: remote
-        test_selection: pg_clients
-        run_in_parallel: false
-        extra_params: -m remote_cluster
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-      env:
-        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
-
-    - name: Delete Neon Project
-      if: always()
-      uses: ./.github/actions/neon-project-delete
-      with:
-        project_id: ${{ steps.create-neon-project.outputs.project_id }}
-        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Create Allure report
-      if: ${{ !cancelled() }}
-      id: create-allure-report
-      uses: ./.github/actions/allure-report-generate
-      with:
-        store-test-results-into-db: true
-      env:
-        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-
-    - name: Post to a Slack channel
-      if: github.event.schedule && failure()
-      uses: slackapi/slack-github-action@v1
-      with:
-        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
-        slack-message: |
-          Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -0,0 +1,98 @@
+name: Test Postgres client libraries
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '23 02 * * *' # run once a day, timezone is utc
+
+  workflow_dispatch:
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+jobs:
+  test-postgres-client-libs:
+    # TODO: switch to gen2 runner, requires docker
+    runs-on: ubuntu-22.04
+
+    env:
+      DEFAULT_PG_VERSION: 14
+      TEST_OUTPUT: /tmp/test_output
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+
+    - uses: actions/setup-python@v4
+      with:
+        python-version: "3.9"  # quoted: a bare version is parsed as a YAML float (3.10 would become 3.1)
+
+    - name: Install Poetry
+      uses: snok/install-poetry@v1
+
+    - name: Cache poetry deps
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -euxo pipefail {0}
+      run: ./scripts/pysync
+
+    - name: Create Neon Project
+      id: create-neon-project
+      uses: ./.github/actions/neon-project-create
+      with:
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+
+    - name: Run pytest
+      env:
+        REMOTE_ENV: 1
+        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
+        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      shell: bash -euxo pipefail {0}
+      run: |
+        # The test framework expects a psql binary to exist,
+        # but this test doesn't actually need it, so mock it with an empty file.
+        mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
+        ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
+          --tb=short \
+          --verbose \
+          -m "remote_cluster" \
+          -rA "test_runner/pg_clients"
+
+    - name: Delete Neon Project
+      if: ${{ always() }}
+      uses: ./.github/actions/neon-project-delete
+      with:
+        project_id: ${{ steps.create-neon-project.outputs.project_id }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    # We use GitHub's upload-artifact action because `ubuntu-latest` doesn't have the AWS CLI configured.
+    # This will be fixed after switching to a gen2 runner.
+    - name: Upload python test logs
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        retention-days: 7
+        name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs
+        path: ${{ env.TEST_OUTPUT }}
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1397,9 +1397,9 @@ dependencies = [

 [[package]]
 name = "crc32c"
-version = "0.6.8"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
+checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
 dependencies = [
 "rustc_version",
 ]
@@ -1651,16 +1651,6 @@ dependencies = [
 "rusticata-macros",
 ]

-[[package]]
-name = "deranged"
-version = "0.3.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
-dependencies = [
- "powerfmt",
- "serde",
-]
-
 [[package]]
 name = "desim"
 version = "0.1.0"
@@ -3018,9 +3008,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
+checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
 "bytes",
 "crossbeam-utils",
@@ -3036,9 +3026,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
+checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",
@@ -3048,9 +3038,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
+checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
 dependencies = [
 "libc",
 "measured",
@@ -3285,12 +3275,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "num-conv"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -3683,7 +3667,6 @@ dependencies = [
 "sysinfo",
 "tenant_size_model",
 "thiserror",
- "tikv-jemallocator",
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
@@ -4037,7 +4020,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4050,7 +4033,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -4069,7 +4052,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4094,7 +4077,6 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
- "tokio-util",
 "tracing",
 "workspace_hack",
 ]
@@ -4135,12 +4117,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "powerfmt"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
-
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -5420,9 +5396,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

 [[package]]
 name = "serde"
-version = "1.0.203"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
+checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
 dependencies = [
 "serde_derive",
 ]
@@ -5439,9 +5415,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.203"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
+checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -6131,15 +6107,12 @@ dependencies = [

 [[package]]
 name = "time"
-version = "0.3.36"
+version = "0.3.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
 dependencies = [
- "deranged",
 "itoa",
 "js-sys",
- "num-conv",
- "powerfmt",
 "serde",
 "time-core",
 "time-macros",
@@ -6147,17 +6120,16 @@ dependencies = [

 [[package]]
 name = "time-core"
-version = "0.1.2"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
+checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"

 [[package]]
 name = "time-macros"
-version = "0.2.18"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
 dependencies = [
- "num-conv",
 "time-core",
 ]

@@ -6253,7 +6225,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6839,7 +6811,6 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
- "toml_edit 0.19.10",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -7455,12 +7426,13 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
- "deranged",
 "either",
 "fail",
 "futures-channel",
+ "futures-core",
 "futures-executor",
 "futures-io",
+ "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
 "hashbrown 0.14.5",
@@ -7478,9 +7450,7 @@ dependencies = [
 "num-traits",
 "once_cell",
 "parquet",
- "proc-macro2",
 "prost",
- "quote",
 "rand 0.8.5",
 "regex",
 "regex-automata 0.4.3",
@@ -7497,7 +7467,6 @@ dependencies = [
 "syn 1.0.109",
 "syn 2.0.52",
 "sync_wrapper",
- "tikv-jemalloc-sys",
 "time",
 "time-macros",
 "tokio",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -111,8 +111,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.22", features=["lasso"] }
-measured-process = { version = "0.0.22" }
+measured = { version = "0.0.21", features=["lasso"] }
+measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
--- a/5
+++ b/5
@@ -42,13 +42,12 @@ ARG CACHEPOT_BUCKET=neon-github-dev
 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
 COPY --chown=nonroot . .

 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build  \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
@@ -57,7 +56,6 @@ RUN set -e \
      --bin storage_controller  \
      --bin proxy  \
      --bin neon_local \
-      --bin storage_scrubber \
      --locked --release \
    && cachepot -s

@@ -84,7 +82,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -1,13 +1,5 @@
 FROM debian:bullseye-slim

-# Use ARG as a build-time environment variable here to allow.
-# It's not supposed to be set outside.
-# Alternatively it can be obtained using the following command
-# ```
-# . /etc/os-release && echo "${VERSION_CODENAME}"
-# ```
-ARG DEBIAN_VERSION_CODENAME=bullseye
-
 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
 SHELL ["/bin/bash", "-c"]
@@ -34,6 +26,7 @@ RUN set -e \
        liblzma-dev \
        libncurses5-dev \
        libncursesw5-dev \
+        libpq-dev \
        libreadline-dev \
        libseccomp-dev \
        libsqlite3-dev \
@@ -74,24 +67,12 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
 # LLVM
 ENV LLVM_VERSION=18
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
-    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
+    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

-# Install docker
-RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
-    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
-    && apt update \
-    && apt install -y docker-ce docker-ce-cli \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# Configure sudo & docker
-RUN usermod -aG sudo nonroot && \
-    echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
-    usermod -aG docker nonroot
-
 # AWS CLI
 RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
    && unzip -q awscliv2.zip \
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -798,11 +798,7 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let mut connstr = self.connstr.clone();
-        connstr
-            .query_pairs_mut()
-            .append_pair("application_name", "apply_config");
-
+        let connstr = self.connstr.clone();
        let mut client = match Client::connect(connstr.as_str(), NoTls) {
            Err(e) => match e.code() {
                Some(&SqlState::INVALID_PASSWORD)
@@ -871,19 +867,15 @@ impl ComputeNode {

        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
-            let mut connstr = connstr.clone();
-            connstr
-                .query_pairs_mut()
-                .append_pair("application_name", "migrations");
-
            let mut client = Client::connect(connstr.as_str(), NoTls)?;
            handle_migrations(&mut client).context("apply_config handle_migrations")
        });
        Ok(())
    }

-    // Wrapped this around `pg_ctl reload`, but right now we don't use
-    // `pg_ctl` for start / stop.
+    // Reload the Postgres configuration. We don't use `pg_ctl` for start / stop,
+    // but this function does resolve the `pg_ctl` binary next to `pgbin`.
+    // NOTE(review): presumably it runs `pg_ctl reload` -- confirm against the body.
    #[instrument(skip_all)]
    fn pg_reload_conf(&self) -> Result<()> {
        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
@@ -1395,9 +1387,7 @@ pub fn forward_termination_signal() {
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
-        // ROs to get a list of running xacts faster instead of going through the CLOG.
-        // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
-        kill(pg_pid, Signal::SIGINT).ok();
+        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
+        kill(pg_pid, Signal::SIGQUIT).ok();
    }
 }
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -11,7 +11,6 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
-mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -1,100 +0,0 @@
-use anyhow::{Context, Result};
-use postgres::Client;
-use tracing::info;
-
-pub(crate) struct MigrationRunner<'m> {
-    client: &'m mut Client,
-    migrations: &'m [&'m str],
-}
-
-impl<'m> MigrationRunner<'m> {
-    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
-        Self { client, migrations }
-    }
-
-    fn get_migration_id(&mut self) -> Result<i64> {
-        let query = "SELECT id FROM neon_migration.migration_id";
-        let row = self
-            .client
-            .query_one(query, &[])
-            .context("run_migrations get migration_id")?;
-
-        Ok(row.get::<&str, i64>("id"))
-    }
-
-    fn update_migration_id(&mut self) -> Result<()> {
-        let setval = format!(
-            "UPDATE neon_migration.migration_id SET id={}",
-            self.migrations.len()
-        );
-
-        self.client
-            .simple_query(&setval)
-            .context("run_migrations update id")?;
-
-        Ok(())
-    }
-
-    fn prepare_migrations(&mut self) -> Result<()> {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        self.client.simple_query(query)?;
-
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        self.client.simple_query(query)?;
-
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        self.client.simple_query(query)?;
-
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        self.client.simple_query(query)?;
-
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        self.client.simple_query(query)?;
-
-        Ok(())
-    }
-
-    pub fn run_migrations(mut self) -> Result<()> {
-        self.prepare_migrations()?;
-
-        let mut current_migration: usize = self.get_migration_id()? as usize;
-        let starting_migration_id = current_migration;
-
-        let query = "BEGIN";
-        self.client
-            .simple_query(query)
-            .context("run_migrations begin")?;
-
-        while current_migration < self.migrations.len() {
-            let migration = self.migrations[current_migration];
-
-            if migration.starts_with("-- SKIP") {
-                info!("Skipping migration id={}", current_migration);
-            } else {
-                info!(
-                    "Running migration id={}:\n{}\n",
-                    current_migration, migration
-                );
-                self.client.simple_query(migration).with_context(|| {
-                    format!("run_migration current_migration={}", current_migration)
-                })?;
-            }
-
-            current_migration += 1;
-        }
-
-        self.update_migration_id()?;
-
-        let query = "COMMIT";
-        self.client
-            .simple_query(query)
-            .context("run_migrations commit")?;
-
-        info!(
-            "Ran {} migrations",
-            (self.migrations.len() - starting_migration_id)
-        );
-
-        Ok(())
-    }
-}
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
 /// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
 /// - next line starts with timestamp
 /// - EOF
-/// - no new lines were written for the last 100 milliseconds
+/// - no new lines were written for the last 100 milliseconds
 async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
    let mut lines = tokio::io::BufReader::new(stderr).lines();
    let timeout_duration = Duration::from_millis(100);
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -10,7 +10,6 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::logger::inlinify;
-use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -792,7 +791,69 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
    ];

-    MigrationRunner::new(client, &migrations).run_migrations()?;
+    let mut func = || {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        client.simple_query(query)?;
+        Ok::<_, anyhow::Error>(())
+    };
+    func().context("handle_migrations prepare")?;
+
+    let query = "SELECT id FROM neon_migration.migration_id";
+    let row = client
+        .query_one(query, &[])
+        .context("handle_migrations get migration_id")?;
+    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
+    let starting_migration_id = current_migration;
+
+    let query = "BEGIN";
+    client
+        .simple_query(query)
+        .context("handle_migrations begin")?;
+
+    while current_migration < migrations.len() {
+        let migration = &migrations[current_migration];
+        if migration.starts_with("-- SKIP") {
+            info!("Skipping migration id={}", current_migration);
+        } else {
+            info!(
+                "Running migration id={}:\n{}\n",
+                current_migration, migration
+            );
+            client.simple_query(migration).with_context(|| {
+                format!("handle_migrations current_migration={}", current_migration)
+            })?;
+        }
+        current_migration += 1;
+    }
+    let setval = format!(
+        "UPDATE neon_migration.migration_id SET id={}",
+        migrations.len()
+    );
+    client
+        .simple_query(&setval)
+        .context("handle_migrations update id")?;
+
+    let query = "COMMIT";
+    client
+        .simple_query(query)
+        .context("handle_migrations commit")?;
+
+    info!(
+        "Ran {} migrations",
+        (migrations.len() - starting_migration_id)
+    );

    Ok(())
 }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -21,8 +21,10 @@ use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
-use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest};
-use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::controller_api::PlacementPolicy;
+use pageserver_api::models::{
+    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -325,16 +325,11 @@ impl LocalEnv {
        }
    }

-    pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
-        Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
-    }
-
    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        self.pg_dir(pg_version, "bin")
+        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
-
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        self.pg_dir(pg_version, "lib")
+        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

    pub fn pageserver_bin(&self) -> PathBuf {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -15,8 +15,10 @@ use std::time::Duration;

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
+use futures::SinkExt;
 use pageserver_api::models::{
-    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
+    self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
+    TimelineInfo,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -395,6 +397,28 @@ impl PageServerNode {
        }
    }

+    /// Create an unsharded tenant by sending a `TenantCreateRequest` through `self.http_client`.
+    pub async fn tenant_create(
+        &self,
+        new_tenant_id: TenantId,
+        generation: Option<u32>,
+        settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<TenantId> {
+        // `parse_config` takes the map by value, so this function cannot observe
+        // which keys it consumed. NOTE(review): rejecting unknown keys has to happen
+        // inside `parse_config`; checking `settings` here would fail for any input.
+        let config = Self::parse_config(settings)?;
+        let request = models::TenantCreateRequest {
+            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
+            generation,
+            config,
+            shard_parameters: ShardParameters::default(),
+            // Placement policy is not meaningful for creations not done via storage controller
+            placement_policy: None,
+        };
+        Ok(self.http_client.tenant_create(&request).await?)
+    }
+
    pub async fn tenant_config(
        &self,
        tenant_id: TenantId,
@@ -565,39 +589,60 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
+        let (client, conn) = self.page_server_psql_client().await?;
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own.
+        tokio::spawn(async move {
+            if let Err(e) = conn.await {
+                eprintln!("connection error: {}", e);
+            }
+        });
+        let client = std::pin::pin!(client);
+
        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
        let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
-        let base_tarfile =
-            mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
+        let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);

        // Init wal reader if necessary
        let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
            let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
-            let wal_reader =
-                mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
+            let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
            (end_lsn, Some(wal_reader))
        } else {
            (start_lsn, None)
        };

-        // Import base
-        self.http_client
-            .import_basebackup(
-                tenant_id,
-                timeline_id,
-                start_lsn,
-                end_lsn,
-                pg_version,
-                base_tarfile,
-            )
-            .await?;
+        let copy_in = |reader, cmd| {
+            let client = &client;
+            async move {
+                let writer = client.copy_in(&cmd).await?;
+                let writer = std::pin::pin!(writer);
+                let mut writer = writer.sink_map_err(|e| {
+                    std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
+                });
+                let mut reader = std::pin::pin!(reader);
+                writer.send_all(&mut reader).await?;
+                writer.into_inner().finish().await?;
+                anyhow::Ok(())
+            }
+        };

+        // Import base
+        copy_in(
+            base_tarfile,
+            format!(
+                "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
+            ),
+        )
+        .await?;
        // Import wal if necessary
        if let Some(wal_reader) = wal_reader {
-            self.http_client
-                .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
-                .await?;
+            copy_in(
+                wal_reader,
+                format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
+            )
+            .await?;
        }

        Ok(())
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -5,11 +5,12 @@ use crate::{
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::{
    controller_api::{
-        NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse,
-        TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
+        NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
+        TenantShardMigrateRequest, TenantShardMigrateResponse,
    },
    models::{
-        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TimelineCreateRequest, TimelineInfo,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -155,16 +156,16 @@ impl StorageController {
        .expect("non-Unicode path")
    }

-    /// Find the directory containing postgres subdirectories, such `bin` and `lib`
+    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
    ///
    /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
    /// to other versions if that one isn't found.  Some automated tests create circumstances
    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
-    async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
+    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];

        for v in prefer_versions {
-            let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
+            let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
            if tokio::fs::try_exists(&path).await? {
                return Ok(path);
            }
@@ -172,20 +173,11 @@ impl StorageController {

        // Fall through
        anyhow::bail!(
-            "Postgres directory '{}' not found in {}",
-            dir_name,
-            self.env.pg_distrib_dir.display(),
+            "Postgres binaries not found in {}",
+            self.env.pg_distrib_dir.display()
        );
    }

-    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
-        self.get_pg_dir("bin").await
-    }
-
-    pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
-        self.get_pg_dir("lib").await
-    }
-
    /// Readiness check for our postgres process
    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
        let bin_path = pg_bin_dir.join("pg_isready");
@@ -238,17 +230,12 @@ impl StorageController {
            .unwrap()
            .join("storage_controller_db");
        let pg_bin_dir = self.get_pg_bin_dir().await?;
-        let pg_lib_dir = self.get_pg_lib_dir().await?;
        let pg_log_path = pg_data_path.join("postgres.log");

        if !tokio::fs::try_exists(&pg_data_path).await? {
            // Initialize empty database
            let initdb_path = pg_bin_dir.join("initdb");
            let mut child = Command::new(&initdb_path)
-                .envs(vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ])
                .args(["-D", pg_data_path.as_ref()])
                .spawn()
                .expect("Failed to spawn initdb");
@@ -283,10 +270,7 @@ impl StorageController {
            &self.env.base_data_dir,
            pg_bin_dir.join("pg_ctl").as_std_path(),
            db_start_args,
-            vec![
-                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ],
+            [],
            background_process::InitialPidFile::Create(self.postgres_pid_file()),
            retry_timeout,
            || self.pg_isready(&pg_bin_dir),
@@ -341,10 +325,7 @@ impl StorageController {
            &self.env.base_data_dir,
            &self.env.storage_controller_bin(),
            args,
-            vec![
-                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ],
+            [],
            background_process::InitialPidFile::Create(self.pid_file()),
            retry_timeout,
            || async {
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -4,13 +4,13 @@ use std::{str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use pageserver_api::{
    controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
+        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
-        TenantShardSplitResponse,
+        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -336,18 +336,14 @@ async fn main() -> anyhow::Result<()> {
                .await?;
        }
        Command::TenantCreate { tenant_id } => {
-            storcon_client
-                .dispatch(
-                    Method::POST,
-                    "v1/tenant".to_string(),
-                    Some(TenantCreateRequest {
-                        new_tenant_id: TenantShardId::unsharded(tenant_id),
-                        generation: None,
-                        shard_parameters: ShardParameters::default(),
-                        placement_policy: Some(PlacementPolicy::Attached(1)),
-                        config: TenantConfig::default(),
-                    }),
-                )
+            vps_client
+                .tenant_create(&TenantCreateRequest {
+                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    generation: None,
+                    shard_parameters: ShardParameters::default(),
+                    placement_policy: Some(PlacementPolicy::Attached(1)),
+                    config: TenantConfig::default(),
+                })
                .await?;
        }
        Command::TenantDelete { tenant_id } => {
--- a/docs/rfcs/033-storage-controller-drain-and-fill.md
+++ b/docs/rfcs/033-storage-controller-drain-and-fill.md
@@ -1,345 +0,0 @@
-# Graceful Restarts of Storage Controller Managed Clusters
-
-## Summary
-This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes.
-It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement
-graceful cluster restarts.
-
-## Motivation
-
-Pageserver restarts cause read availability downtime for tenants.
-
-For example pageserver-3 @ us-east-1 was unavailable for a randomly
-picked tenant (which requested on-demand activation) for around 30 seconds
-during the restart at 2024-04-03 16:37 UTC.
-
-Note that lots of shutdowns on loaded pageservers do not finish within the
-[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
-and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
-
-This problem is not yet very acutely felt in storage controller managed pageservers since
-tenant density is much lower there. However, we are planning on eventually migrating all
-pageservers to storage controller management, so it makes sense to solve the issue proactively.
-
-## Requirements
-
- Pageserver re-deployments cause minimal downtime for tenants
- The storage controller exposes HTTP API hooks for draining and filling tenant shards
-from a given pageserver. Said hooks can be used by an orchestrator process or a human operator.
- The storage controller exposes some HTTP API to cancel draining and filling background operations.
- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed
-as usual (with downtime).
- Progress of draining/filling is visible through metrics
-
-## Non Goals
-
- Integration with the control plane
- Graceful restarts for large non-HA tenants.
-
-## Impacted Components
-
- storage controller
- deployment orchestrator (i.e. Ansible)
- pageserver (indirectly)
-
-## Terminology
-
-** Draining ** is the process through which all tenant shards that can be migrated from a given pageserver
-are distributed across the rest of the cluster.
-
-** Filling ** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given
-pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers.
-
-** Node scheduling policies ** act as constraints to the scheduler. For instance, when a
-node is set in the `Paused` policy, no further shards will be scheduled on it.
-
-** Node ** is a pageserver. Term is used interchangeably in this RFC.
-
-** Deployment orchestrator ** is a generic term for whatever drives our deployments.
-Currently, it's an Ansible playbook.
-
-## Background
-
-### Storage Controller Basics (skip if already familiar)
-
-Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.
-
-An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook.
-
-### Background Optimizations
-
-The storage controller performs scheduling optimizations in the background. It will
-migrate attachments to warm secondaries and replace secondaries in order to balance
-the cluster out.
-
-### Reconciliations Concurrency Limiting
-
-There's a hard limit on the number of reconciles that the storage controller
-can have in flight at any given time. To get an idea of scales, the limit is
-128 at the time of writing.
-
-## Implementation
-
-Note: this section focuses on the core functionality of the graceful restart process.
-It doesn't necessarily describe the most efficient approach. Optimizations are described
-separately in a later section.
-
-### Overall Flow
-
-This section describes how to implement graceful restarts from the perspective
-of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
-The orchestrator shall implement the following epilogue and prologue steps for each
-pageserver restart:
-
-#### Prologue
-
-The orchestrator shall first fetch the pageserver node id from the control plane or
-the pageserver it aims to restart directly. Next, it issues an HTTP request
-to the storage controller in order to start the drain of said pageserver node.
-All error responses are retried with a short back-off. When a 202 (Accepted)
-HTTP code is returned, the drain has started. Now the orchestrator polls the
-node status endpoint exposed by the storage controller in order to await the
-end of the drain process. When the `policy` field of the node status response
-becomes `PauseForRestart`, the drain has completed and the orchestrator can
-proceed with restarting the pageserver.
-
-The prologue is subject to an overall timeout. It will have a value in the ballpark
-of minutes. As storage controller managed pageservers become more loaded this timeout
-will likely have to increase.
-
-#### Epilogue
-
-After restarting the pageserver, the orchestrator issues an HTTP request
-to the storage controller to kick off the filling process. This API call
-may be retried for all error codes with a short backoff. This also serves
-as a synchronization primitive as the fill will be refused if the pageserver
-has not yet re-attached to the storage controller. When a 202(Accepted) HTTP
-code is returned, the fill has started. Now the orchestrator polls the node
-status endpoint exposed by the storage controller in order to await the end of
-the filling process. When the `policy` field of the node status response becomes
-`Active`, the fill has completed and the orchestrator may proceed to the next pageserver.
-
-Again, the epilogue is subject to an overall timeout. We can start off with
-using the same timeout as for the prologue, but can also consider relying on
-the storage controller's background optimizations with a shorter timeout.
-
-In the case that the deployment orchestrator times out, it attempts to cancel
-the fill. This operation shall be retried with a short back-off. If it ultimately
-fails it will require manual intervention to set the nodes scheduling policy to
-`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic,
-but it constrains the scheduler as mentioned previously.
-
-### Node Scheduling Policy State Machine
-
-The state machine below encodes the behaviours discussed above and
-the various failover situations described in a later section.
-
-Assuming no failures and/or timeouts the flow should be:
-`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active`
-
-```
-                          Operator requested drain
-               +-----------------------------------------+
-               |                                         |
-       +-------+-------+                         +-------v-------+
-       |               |                         |               |
-       |     Pause     |             +----------->    Draining   +----------+
-       |               |             |           |               |          |
-       +---------------+             |           +-------+-------+          |
-                                     |                   |                  |
-                                     |                   |                  |
-                      Drain requested|                   |                  |
-                                     |                   |Drain complete    | Drain failed
-                                     |                   |                  | Cancelled/PS reattach/Storcon restart
-                                     |                   |                  |
-                             +-------+-------+           |                  |
-                             |               |           |                  |
-               +-------------+    Active     <-----------+------------------+
-               |             |               |           |
-Fill requested |             +---^---^-------+           |
-               |                 |   |                   |
-               |                 |   |                   |
-               |                 |   |                   |
-               |   Fill completed|   |                   |
-               |                 |   |PS reattach        |
-               |                 |   |after restart      |
-       +-------v-------+         |   |           +-------v-------+
-       |               |         |   |           |               |
-       |    Filling    +---------+   +-----------+PauseForRestart|
-       |               |                         |               |
-       +---------------+                         +---------------+
-```
-
-### Draining/Filling APIs
-
-The storage controller API to trigger the draining of a given node is:
-`PUT /v1/control/node/:node_id/{drain,fill}`.
-
-The following HTTP non-success return codes are used.
-All of them are safely retriable from the perspective of the storage controller.
- 404: Requested node was not found
- 503: Requested node is known to the storage controller, but unavailable
- 412: Drain precondition failed: there is no other node to drain to or the node's scheduling policy forbids draining
- 409: A {drain, fill} is already in progress. Only one such background operation
-is allowed per node.
-
-When the drain is accepted and commenced a 202 HTTP code is returned.
-
-Drains and fills shall be cancellable by the deployment orchestrator or a
-human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200
-response is returned when the cancelation is successful. Errors are retriable.
-
-### Drain Process
-
-Before accepting a drain request the following validations are applied:
-* Ensure that the node is known to the storage controller
-* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
-* Ensure that another drain or fill is not already running on the node
-* Ensure that a drain is possible (i.e. check that there is at least one
-schedulable node to drain to)
-
-After accepting the drain, the scheduling policy of the node is set to
-`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
-This disallows the optimizer from adding or removing shards from the node which
-is desirable to avoid them racing.
-
-Next, a separate Tokio task is spawned to manage the draining. For each tenant
-shard attached to the node being drained, demote the node to a secondary and
-attempt to schedule the node away. Scheduling might fail due to unsatisfiable
-constraints, but that is fine. Draining is a best effort process since it might
-not always be possible to cut over all shards.
-
-Importantly, this task manages the concurrency of issued reconciles in order to
-avoid drowning out the target pageservers and to allow other important reconciles
-to proceed.
-
-Once the triggered reconciles have finished or timed out, set the node's scheduling
-policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
-
-A note on non HA tenants: These tenants do not have secondaries, so by the description
-above, they would not be migrated. It makes sense to skip them (especially the large ones)
-since, depending on tenant size, this might be more disruptive than the restart since the
-pageserver we've moved to will need to on-demand download the entire working set for the tenant.
-We can consider expanding to small non-HA tenants in the future.
-
-### Fill Process
-
-Before accepting a fill request the following validations are applied:
-* Ensure that the node is known to the storage controller
-* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`.
-This is the only acceptable policy for the fill starting state. When a node re-attaches,
-its scheduling policy is set to `NodeSchedulingPolicy::Active` if it was equal to
-`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
-* Ensure that another drain or fill is not already running on the node
-
-After accepting the fill, the scheduling policy of the node is set to
-`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
-This disallows the optimizer from adding or removing shards from the node which
-is desirable to avoid them racing.
-
-Next, a separate Tokio task is spawned to manage the filling. For each tenant
-shard where the filled node is a secondary, promote the secondary. This is done
-until we run out of shards or the counts of attached shards become balanced across
-the cluster.
-
-Like for draining, the concurrency of spawned reconciles is limited.
-
-### Failure Modes & Handling
-
-Failures are generally handled by transition back into the `Active`
-(neutral) state. This simplifies the implementation greatly at the
-cost of adding transitions to the state machine. For example, we
-could detect the `Draining` state upon restart and proceed with a drain,
-but how should the storage controller know that's what the orchestrator
-needs still?
-
-#### Storage Controller Crash
-
-When the storage controller starts up reset the node scheduling policy
-of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
-`Active`. The rationale is that when the storage controller restarts,
-we have lost context of what the deployment orchestrator wants. It also
-has the benefit of making things easier to reason about.
-
-#### Pageserver Crash During Drain
-
-The pageserver will attempt to re-attach during restart at which
-point the node scheduling policy will be set back to `Active`, thus
-reenabling the scheduler to use the node.
-
-#### Non-drained Pageserver Crash During Drain
-
-What should happen when a pageserver we are draining to crashes during the
-process. Two reasonable options are: cancel the drain and focus on the failover
-*or* do both, but prioritise failover. Since the number of concurrent reconciles
-produced by drains/fills is limited, we get the latter behaviour for free.
-My suggestion is we take this approach, but the cancellation option is trivial
-to implement as well.
-
-#### Pageserver Crash During Fill
-
-The pageserver will attempt to re-attach during restart at which
-point the node scheduling policy will be set back to `Active`, thus
-reenabling the scheduler to use the node.
-
-#### Pageserver Goes unavailable During Drain/Fill
-
-The drain and fill jobs handle this by stopping early. When the pageserver
-is detected as online by storage controller heartbeats, reset its scheduling
-policy to `Active`. If a restart happens instead, see the pageserver crash
-failure mode.
-
-#### Orchestrator Drain Times Out
-
-Orchestrator will still proceed with the restart.
-When the pageserver re-attaches, the scheduling policy is set back to
-`Active`.
-
-#### Orchestrator Fill Times Out
-
-Orchestrator will attempt to cancel the fill operation. If that fails,
-the fill will continue until it quiesces and the node will be left
-in the `Filling` scheduling policy. This hinders the scheduler, but is
-otherwise harmless. A human operator can handle this by setting the scheduling
-policy to `Active`, or we can bake in a fill timeout into the storage controller.
-
-## Optimizations
-
-### Location Warmth
-
-When cutting over to a secondary, the storage controller will wait for it to
-become "warm" (i.e. download enough of the tenants data). This means that some
-reconciliations can take significantly longer than others and hold up precious
-reconciliations units. As an optimization, the drain stage can only cut over
-tenants that are already "warm". Similarly, the fill stage can prioritise the
-"warmest" tenants in the fill.
-
-Given that the number of tenants managed by the storage controller will be fairly low
-for the foreseeable future, the first implementation could simply query the tenants
-for secondary status. This doesn't scale well with increasing tenant counts, so
-eventually we will need new pageserver API endpoints to report the sets of
-"warm" and "cold" nodes.
-
-## Alternatives Considered
-
-### Draining and Filling Purely as Scheduling Constraints
-
-At its core, the storage controller is a big background loop that detects changes
-in the environment and reacts on them. One could express draining and filling
-of nodes purely in terms of constraining the scheduler (as opposed to having
-such background tasks).
-
-While theoretically nice, I think that's harder to implement and more importantly operate and reason about.
-Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create
-an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish
-to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong
-to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion.
-
-It would also mean that reconciliations themselves have side effects that persist in the database
-(persist something to the database when the drain is done), which I'm not conceptually fond of.
-
-## Proof of Concept
-
-This RFC is accompanied by a POC which implements nearly everything mentioned here
-apart from the optimizations and some of the failure handling:
-https://github.com/neondatabase/neon/pull/7682
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -13,7 +13,11 @@ use std::{

 use measured::{
    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
-    metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
+    metric::{
+        group::{Encoding, MetricValue},
+        name::MetricNameEncoder,
+        Metric, MetricType, MetricVec,
+    },
    text::TextEncoder,
    LabelGroup,
 };
@@ -140,7 +144,6 @@ impl<const N: usize> HyperLogLogState<N> {
        })
    }
 }
-
 impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
    for HyperLogLogState<N>
 {
@@ -179,13 +182,12 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
            .into_iter()
            .enumerate()
            .try_for_each(|(hll_shard, val)| {
-                CounterState::new(val as u64).collect_into(
-                    &(),
+                enc.write_metric_value(
+                    name.by_ref(),
                    labels.by_ref().compose_with(HllShardLabel {
                        hll_shard: hll_shard as i64,
                    }),
-                    name.by_ref(),
-                    enc,
+                    MetricValue::Int(val as i64),
                )
            })
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -9,7 +9,7 @@ use measured::{
    metric::{
        counter::CounterState,
        gauge::GaugeState,
-        group::Encoding,
+        group::{Encoding, MetricValue},
        name::{MetricName, MetricNameEncoder},
        MetricEncoding, MetricFamilyEncoding,
    },
@@ -171,11 +171,8 @@ fn write_gauge<Enc: Encoding>(
    labels: impl LabelGroup,
    name: impl MetricNameEncoder,
    enc: &mut Enc,
-) -> Result<(), Enc::Err>
-where
-    GaugeState: MetricEncoding<Enc>,
-{
-    GaugeState::new(x).collect_into(&(), labels, name, enc)
+) -> Result<(), Enc::Err> {
+    enc.write_metric_value(name, labels, MetricValue::Int(x))
 }

 #[derive(Default)]
@@ -547,6 +544,15 @@ impl<T: Encoding> Encoding for Inc<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
 }

 impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
@@ -573,6 +579,15 @@ impl<T: Encoding> Encoding for Dec<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
 }

 /// Write the dec counter to the encoder
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -11,27 +11,6 @@ use crate::{
    shard::{ShardStripeSize, TenantShardId},
 };

-#[derive(Serialize, Deserialize, Debug)]
-#[serde(deny_unknown_fields)]
-pub struct TenantCreateRequest {
-    pub new_tenant_id: TenantShardId,
-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub generation: Option<u32>,
-
-    // If omitted, create a single shard with TenantShardId::unsharded()
-    #[serde(default)]
-    #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
-    pub shard_parameters: ShardParameters,
-
-    #[serde(default)]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub placement_policy: Option<PlacementPolicy>,
-
-    #[serde(flatten)]
-    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
    pub shard_id: TenantShardId,
@@ -301,19 +280,4 @@ mod test {
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
        Ok(())
    }
-
-    #[test]
-    fn test_reject_unknown_field() {
-        let id = TenantId::generate();
-        let create_request = serde_json::json!({
-            "new_tenant_id": id.to_string(),
-            "unknown_field": "unknown_value".to_string(),
-        });
-        let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
-        assert!(
-            err.to_string().contains("unknown field `unknown_field`"),
-            "expect unknown field `unknown_field` error, got: {}",
-            err
-        );
-    }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18;
 /// See [`Key::to_i128`] for more information on the encoding.
 pub const METADATA_KEY_SIZE: usize = 16;

-/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key.
+/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 are metadata keys.
 pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
 pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;

--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -17,16 +17,6 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
 }

-impl std::fmt::Display for KeySpace {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "[")?;
-        for range in &self.ranges {
-            write!(f, "{}..{},", range.start, range.end)?;
-        }
-        write!(f, "]")
-    }
-}
-
 /// A wrapper type for sparse keyspaces.
 #[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct SparseKeySpace(pub KeySpace);
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,6 @@ use std::{
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
-    str::FromStr,
    sync::atomic::AtomicUsize,
    time::{Duration, SystemTime},
 };
@@ -26,6 +25,7 @@ use utils::{
    serde_system_time,
 };

+use crate::controller_api::PlacementPolicy;
 use crate::{
    reltag::RelTag,
    shard::{ShardCount, ShardStripeSize, TenantShardId},
@@ -229,11 +229,6 @@ pub struct TimelineCreateRequest {
    pub pg_version: Option<u32>,
 }

-#[derive(Serialize, Deserialize, Clone)]
-pub struct LsnLeaseRequest {
-    pub lsn: Lsn,
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct TenantShardSplitRequest {
    pub new_shard_count: u8,
@@ -276,6 +271,28 @@ impl Default for ShardParameters {
    }
 }

+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantCreateRequest {
+    pub new_tenant_id: TenantShardId,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
+
+    // If omitted, create a single shard with TenantShardId::unsharded()
+    #[serde(default)]
+    #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
+    pub shard_parameters: ShardParameters,
+
+    // This parameter is only meaningful in requests sent to the storage controller
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub placement_policy: Option<PlacementPolicy>,
+
+    #[serde(flatten)]
+    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
@@ -438,41 +455,6 @@ pub enum CompactionAlgorithm {
    Tiered,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub enum ImageCompressionAlgorithm {
-    // Disabled for writes, support decompressing during read path
-    Disabled,
-    /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
-    /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
-    Zstd {
-        level: Option<i8>,
-    },
-}
-
-impl FromStr for ImageCompressionAlgorithm {
-    type Err = anyhow::Error;
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut components = s.split(['(', ')']);
-        let first = components
-            .next()
-            .ok_or_else(|| anyhow::anyhow!("empty string"))?;
-        match first {
-            "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
-            "zstd" => {
-                let level = if let Some(v) = components.next() {
-                    let v: i8 = v.parse()?;
-                    Some(v)
-                } else {
-                    None
-                };
-
-                Ok(ImageCompressionAlgorithm::Zstd { level })
-            }
-            _ => anyhow::bail!("invalid specifier '{first}'"),
-        }
-    }
-}
-
 #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
 pub struct CompactionAlgorithmSettings {
    pub kind: CompactionAlgorithm,
@@ -565,6 +547,10 @@ pub struct LocationConfigListResponse {
    pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
 }

+#[derive(Serialize, Deserialize)]
+#[serde(transparent)]
+pub struct TenantCreateResponse(pub TenantId);
+
 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
@@ -684,16 +670,6 @@ pub struct TimelineInfo {
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
    pub current_logical_size_non_incremental: Option<u64>,

-    /// How many bytes of WAL are within this branch's pitr_interval.  If the pitr_interval goes
-    /// beyond the branch's branch point, we only count up to the branch point.
-    pub pitr_history_size: u64,
-
-    /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
-    /// ancestor data used by this branch would have been retained anyway).  If this is false, then
-    /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
-    /// otherwise be able to GC.
-    pub within_ancestor_pitr: bool,
-
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
@@ -1531,6 +1507,18 @@ mod tests {

    #[test]
    fn test_reject_unknown_field() {
+        let id = TenantId::generate();
+        let create_request = json!({
+            "new_tenant_id": id.to_string(),
+            "unknown_field": "unknown_value".to_string(),
+        });
+        let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+
        let id = TenantId::generate();
        let config_request = json!({
            "tenant_id": id.to_string(),
@@ -1665,25 +1653,4 @@ mod tests {
            AuxFilePolicy::CrossValidation
        );
    }
-
-    #[test]
-    fn test_image_compression_algorithm_parsing() {
-        use ImageCompressionAlgorithm::*;
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
-            Disabled
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
-            Zstd { level: None }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
-            Zstd { level: Some(18) }
-        );
-        assert_eq!(
-            ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
-            Zstd { level: Some(-3) }
-        );
-    }
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,42 +1,59 @@
-//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
-//!
-//! This module contains a variety of types used to represent the concept of sharding
-//! a Neon tenant across multiple physical shards.  Since there are quite a few of these,
-//! we provide an summary here.
-//!
-//! Types used to describe shards:
-//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
-//!   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
-//!   a shard suffix.
-//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
-//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
-//!   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
-//!   tenant, such as layer files.
-//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
-//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
-//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
-//!   four hex digits.  An unsharded tenant is `0000`.
-//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
-//!
-//! Types used to describe the parameters for data distribution in a sharded tenant:
-//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
-//!   multiple shards.  Its value is given in 8kiB pages.
-//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
-//!   always zero: this is provided for future upgrades that might introduce different
-//!   data distribution schemes.
-//!
-//! Examples:
-//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
-//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
-//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
-//!   and their slugs are 0004, 0104, 0204, and 0304.
+use std::{ops::RangeInclusive, str::FromStr};

 use crate::{key::Key, models::ShardParameters};
+use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
+use utils::id::TenantId;

-#[doc(inline)]
-pub use ::utils::shard::*;
+/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
+///
+/// This module contains a variety of types used to represent the concept of sharding
+/// a Neon tenant across multiple physical shards.  Since there are quite a few of these,
+/// we provide a summary here.
+///
+/// Types used to describe shards:
+/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+///   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
+///   a shard suffix.
+/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
+///   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
+///   tenant, such as layer files.
+/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
+///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+///   four hex digits.  An unsharded tenant is `0000`.
+/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
+///
+/// Types used to describe the parameters for data distribution in a sharded tenant:
+/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+///   multiple shards.  Its value is given in 8kiB pages.
+/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+///   always zero: this is provided for future upgrades that might introduce different
+///   data distribution schemes.
+///
+/// Examples:
+/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
+/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
+/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
+///   and their slugs are 0004, 0104, 0204, and 0304.
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardNumber(pub u8);
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardCount(u8);
+
+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}

 /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
 /// and to check whether that [`ShardNumber`] is the same as the current shard.
@@ -48,6 +65,362 @@ pub struct ShardIdentity {
    layout: ShardLayout,
 }

+/// Formatting helper, for generating the `shard_id` label in traces.
+struct ShardSlug<'a>(&'a TenantShardId);
+
+/// TenantShardId globally identifies a particular shard in a particular tenant.
+///
+/// These are written as `<TenantId>-<ShardSlug>`, for example:
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
+/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
+/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
+///
+/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible with TenantId: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl ShardCount {
+    pub const MAX: Self = Self(u8::MAX);
+
+    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
+    /// legacy format for TenantShardId that excludes the shard suffix", also known
+    /// as [`TenantShardId::unsharded`].
+    ///
+    /// This method returns the actual number of shards, i.e. if our internal value is
+    /// zero, we return 1 (unsharded tenants have 1 shard).
+    pub fn count(&self) -> u8 {
+        if self.0 > 0 {
+            self.0
+        } else {
+            1
+        }
+    }
+
+    /// The literal internal value: this is **not** the number of shards in the
+    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
+    /// [`Self::count`] if you want to know the cardinality of shards.
+    pub fn literal(&self) -> u8 {
+        self.0
+    }
+
+    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
+    /// uses the legacy format for `TenantShardId`. See also the documentation for
+    /// [`Self::count`].
+    pub fn is_unsharded(&self) -> bool {
+        self.0 == 0
+    }
+
+    /// `val` may be zero, or the number of shards in the tenant.  `val` is what
+    /// [`Self::literal`] would return.
+    pub const fn new(val: u8) -> Self {
+        Self(val)
+    }
+}
+
+impl ShardNumber {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl TenantShardId {
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        RangeInclusive::new(
+            Self {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            },
+            Self {
+                tenant_id,
+                shard_number: ShardNumber::MAX,
+                shard_count: ShardCount::MAX,
+            },
+        )
+    }
+
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+
+    /// Convenience for code that has special behavior on the 0th shard.
+    pub fn is_shard_zero(&self) -> bool {
+        self.shard_number == ShardNumber(0)
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
+    }
+
+    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
+    /// is useful when logging from code that is already in a span that includes tenant ID, to
+    /// keep messages reasonably terse.
+    pub fn to_index(&self) -> ShardIndex {
+        ShardIndex {
+            shard_number: self.shard_number,
+            shard_count: self.shard_count,
+        }
+    }
+
+    /// Calculate the children of this TenantShardId when splitting the overall tenant into
+    /// the given number of shards.
+    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
+        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
+        let mut child_shards = Vec::new();
+        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
+            // Key mapping is based on a round robin mapping of key hash modulo shard count,
+            // so our child shards are the ones which the same keys would map to.
+            if shard_number % effective_old_shard_count == self.shard_number.0 {
+                child_shards.push(TenantShardId {
+                    tenant_id: self.tenant_id,
+                    shard_number: ShardNumber(shard_number),
+                    shard_count: new_shard_count,
+                })
+            }
+        }
+
+        child_shards
+    }
+}
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count != ShardCount(0) {
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+        } else {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            self.tenant_id.fmt(f)
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(b[16]),
+            shard_count: ShardCount(b[17]),
+        }
+    }
+}
+
+impl ShardIndex {
+    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            shard_number: number,
+            shard_count: count,
+        }
+    }
+    pub fn unsharded() -> Self {
+        Self {
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
+
+    /// For use in constructing remote storage paths: concatenate this with a TenantId
+    /// to get a fully qualified TenantShardId.
+    ///
+    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
+    /// that the legacy pre-sharding remote key format is preserved.
+    pub fn get_suffix(&self) -> String {
+        if self.is_unsharded() {
+            "".to_string()
+        } else {
+            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+        }
+    }
+}
+
+impl std::fmt::Display for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    }
+}
+
+impl std::fmt::Debug for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for ShardIndex {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 1 byte shard number, 1 byte shard count
+        if s.len() == 4 {
+            let bytes = s.as_bytes();
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(bytes, &mut shard_parts)?;
+            Ok(Self {
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 2]> for ShardIndex {
+    fn from(b: [u8; 2]) -> Self {
+        Self {
+            shard_number: ShardNumber(b[0]),
+            shard_count: ShardCount(b[1]),
+        }
+    }
+}
+
+impl Serialize for TenantShardId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Note: while human encoding of [`TenantShardId`] is backward and forward
+            // compatible, this binary encoding is not.
+            let mut packed: [u8; 18] = [0; 18];
+            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
+            packed[16] = self.shard_number.0;
+            packed[17] = self.shard_count.0;
+
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 18])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 18] = Deserialize::deserialize(s)?;
+                Ok(TenantShardId::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                18,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
@@ -212,6 +585,77 @@ impl ShardIdentity {
    }
 }

+impl Serialize for ShardIndex {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Binary encoding is not used in index_part.json, but is included in anticipation of
+            // switching various structures (e.g. inter-process communication, remote metadata) to more
+            // compact binary encodings in future.
+            let mut packed: [u8; 2] = [0; 2];
+            packed[0] = self.shard_number.0;
+            packed[1] = self.shard_count.0;
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for ShardIndex {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = ShardIndex;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 2])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 2] = Deserialize::deserialize(s)?;
+                Ok(ShardIndex::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                ShardIndex::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                2,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
 /// in order to be able to serve basebackup requests without peer communication).
 fn key_is_shard0(key: &Key) -> bool {
@@ -293,9 +737,7 @@ pub fn describe(

 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
-    use utils::{id::TenantId, Hex};
+    use utils::Hex;

    use super::*;

--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -13,7 +13,6 @@ rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-util.workspace = true
 tokio-rustls.workspace = true
 tracing.workspace = true

@@ -24,4 +23,4 @@ workspace_hack.workspace = true
 once_cell.workspace = true
 rustls-pemfile.workspace = true
 tokio-postgres.workspace = true
-tokio-postgres-rustls.workspace = true
+tokio-postgres-rustls.workspace = true
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -16,7 +16,6 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -401,15 +400,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
    }

    /// Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run(
+    pub async fn run<F, S>(
        mut self,
        handler: &mut impl Handler<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError> {
-        let ret = self.run_message_loop(handler, cancel).await;
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S + Clone,
+        S: Future,
+    {
+        let ret = self
+            .run_message_loop(handler, shutdown_watcher.clone())
+            .await;

        tokio::select! {
-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // do nothing; we most likely got already stopped by shutdown and will log it next.
            }
            _ = self.framed.shutdown() => {
@@ -439,17 +444,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        }
    }

-    async fn run_message_loop(
+    async fn run_message_loop<F, S>(
        &mut self,
        handler: &mut impl Handler<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError> {
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
        trace!("postgres backend to {:?} started", self.peer_addr);

        tokio::select!(
            biased;

-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
                return Err(QueryError::Shutdown)
@@ -464,7 +473,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        let mut query_string = Bytes::new();
        while let Some(msg) = tokio::select!(
            biased;
-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
                return Err(QueryError::Shutdown)
@@ -476,7 +485,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            let result = self.process_message(handler, msg, &mut query_string).await;
            tokio::select!(
                biased;
-                _ = cancel.cancelled() => {
+                _ = shutdown_watcher() => {
                    // We were requested to shut down.
                    tracing::info!("shutdown request received during response flush");

@@ -663,17 +672,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        assert!(self.state < ProtoState::Authentication);
        let have_tls = self.tls_config.is_some();
        match msg {
-            FeStartupPacket::SslRequest { direct } => {
+            FeStartupPacket::SslRequest => {
                debug!("SSL requested");

-                if !direct {
-                    self.write_message(&BeMessage::EncryptionResponse(have_tls))
-                        .await?;
-                } else if !have_tls {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "direct SSL negotiation but no TLS support"
-                    )));
-                }
+                self.write_message(&BeMessage::EncryptionResponse(have_tls))
+                    .await?;

                if have_tls {
                    self.start_tls().await?;
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -3,14 +3,13 @@ use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
 use std::io::Cursor;
-use std::sync::Arc;
+use std::{future, sync::Arc};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::{TcpListener, TcpStream};
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
 use tokio_postgres_rustls::MakeRustlsConnect;
-use tokio_util::sync::CancellationToken;

 // generate client, server test streams
 async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -51,7 +50,7 @@ async fn simple_select() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, &CancellationToken::new()).await
+        pgbackend.run(&mut handler, future::pending::<()>).await
    });

    let conf = Config::new();
@@ -103,7 +102,7 @@ async fn simple_select_ssl() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, &CancellationToken::new()).await
+        pgbackend.run(&mut handler, future::pending::<()>).await
    });

    let client_cfg = rustls::ClientConfig::builder()
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -144,20 +144,7 @@ impl PgConnectionConfig {
            // implement and this function is hardly a bottleneck. The function is only called around
            // establishing a new connection.
            #[allow(unstable_name_collisions)]
-            config.options(
-                &self
-                    .options
-                    .iter()
-                    .map(|s| {
-                        if s.contains(['\\', ' ']) {
-                            Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
-                        } else {
-                            Cow::Borrowed(s.as_str())
-                        }
-                    })
-                    .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
-                    .collect::<String>(),
-            );
+            config.options(&encode_options(&self.options));
        }
        config
    }
@@ -178,6 +165,21 @@ impl PgConnectionConfig {
    }
 }

+#[allow(unstable_name_collisions)]
+fn encode_options(options: &[String]) -> String {
+    options
+        .iter()
+        .map(|s| {
+            if s.contains(['\\', ' ']) {
+                Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
+            } else {
+                Cow::Borrowed(s.as_str())
+            }
+        })
+        .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
+        .collect::<String>()
+}
+
 impl fmt::Display for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The password is intentionally hidden and not part of this display string.
@@ -206,7 +208,7 @@ impl fmt::Debug for PgConnectionConfig {

 #[cfg(test)]
 mod tests_pg_connection_config {
-    use crate::PgConnectionConfig;
+    use crate::{encode_options, PgConnectionConfig};
    use once_cell::sync::Lazy;
    use url::Host;

@@ -255,18 +257,12 @@ mod tests_pg_connection_config {

    #[test]
    fn test_with_options() {
-        let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
-            "hello",
-            "world",
-            "with space",
-            "and \\ backslashes",
+        let options = encode_options(&[
+            "hello".to_owned(),
+            "world".to_owned(),
+            "with space".to_owned(),
+            "and \\ backslashes".to_owned(),
        ]);
-        assert_eq!(cfg.host(), &*STUB_HOST);
-        assert_eq!(cfg.port(), 123);
-        assert_eq!(cfg.raw_address(), "stub.host.example:123");
-        assert_eq!(
-            cfg.to_tokio_postgres_config().get_options(),
-            Some("hello world with\\ space and\\ \\\\\\ backslashes")
-        );
+        assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
    }
 }
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -356,28 +356,6 @@ impl CheckPoint {
        }
        false
    }
-
-    /// Advance next multi-XID/offset to those given in arguments.
-    ///
-    /// It's important that this handles wraparound correctly. This should match the
-    /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function.
-    ///
-    /// Returns 'true' if the Checkpoint was updated.
-    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
-        let mut modified = false;
-
-        if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 {
-            self.nextMulti = multi_xid;
-            modified = true;
-        }
-
-        if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 {
-            self.nextMultiOffset = multi_offset;
-            modified = true;
-        }
-
-        modified
-    }
 }

 /// Generate new, empty WAL segment, with correct block headers at the first
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -202,53 +202,6 @@ pub fn test_update_next_xid() {
    assert_eq!(checkpoint.nextXid.value, 2048);
 }

-#[test]
-pub fn test_update_next_multixid() {
-    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
-    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
-
-    // simple case
-    checkpoint.nextMulti = 20;
-    checkpoint.nextMultiOffset = 20;
-    checkpoint.update_next_multixid(1000, 2000);
-    assert_eq!(checkpoint.nextMulti, 1000);
-    assert_eq!(checkpoint.nextMultiOffset, 2000);
-
-    // No change
-    checkpoint.update_next_multixid(500, 900);
-    assert_eq!(checkpoint.nextMulti, 1000);
-    assert_eq!(checkpoint.nextMultiOffset, 2000);
-
-    // Close to wraparound, but not wrapped around yet
-    checkpoint.nextMulti = 0xffff0000;
-    checkpoint.nextMultiOffset = 0xfffe0000;
-    checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff);
-    assert_eq!(checkpoint.nextMulti, 0xffff00ff);
-    assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);
-
-    // Wraparound
-    checkpoint.update_next_multixid(1, 900);
-    assert_eq!(checkpoint.nextMulti, 1);
-    assert_eq!(checkpoint.nextMultiOffset, 900);
-
-    // Wraparound nextMulti to 0.
-    //
-    // It's a bit surprising that nextMulti can be 0, because that's a special value
-    // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound:
-    // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips
-    // the 0 and the next multi-xid actually assigned is 1.
-    checkpoint.nextMulti = 0xffff0000;
-    checkpoint.nextMultiOffset = 0xfffe0000;
-    checkpoint.update_next_multixid(0, 0xfffe00ff);
-    assert_eq!(checkpoint.nextMulti, 0);
-    assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);
-
-    // Wraparound nextMultiOffset to 0
-    checkpoint.update_next_multixid(0, 0);
-    assert_eq!(checkpoint.nextMulti, 0);
-    assert_eq!(checkpoint.nextMultiOffset, 0);
-}
-
 #[test]
 pub fn test_encode_logical_message() {
    let expected = [
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -44,9 +44,9 @@ impl ConnectionError {
 /// Wraps async io `stream`, providing messages to write/flush + read Postgres
 /// messages.
 pub struct Framed<S> {
-    pub stream: S,
-    pub read_buf: BytesMut,
-    pub write_buf: BytesMut,
+    stream: S,
+    read_buf: BytesMut,
+    write_buf: BytesMut,
 }

 impl<S> Framed<S> {
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -39,39 +39,14 @@ pub enum FeMessage {
    PasswordMessage(Bytes),
 }

-#[derive(Clone, Copy, PartialEq, PartialOrd)]
-pub struct ProtocolVersion(u32);
-
-impl ProtocolVersion {
-    pub const fn new(major: u16, minor: u16) -> Self {
-        Self((major as u32) << 16 | minor as u32)
-    }
-    pub const fn minor(self) -> u16 {
-        self.0 as u16
-    }
-    pub const fn major(self) -> u16 {
-        (self.0 >> 16) as u16
-    }
-}
-
-impl fmt::Debug for ProtocolVersion {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_list()
-            .entry(&self.major())
-            .entry(&self.minor())
-            .finish()
-    }
-}
-
 #[derive(Debug)]
 pub enum FeStartupPacket {
    CancelRequest(CancelKeyData),
-    SslRequest {
-        direct: bool,
-    },
+    SslRequest,
    GssEncRequest,
    StartupMessage {
-        version: ProtocolVersion,
+        major_version: u32,
+        minor_version: u32,
        params: StartupMessageParams,
    },
 }
@@ -326,23 +301,11 @@ impl FeStartupPacket {
    /// different from [`FeMessage::parse`] because startup messages don't have
    /// message type byte; otherwise, its comments apply.
    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
        const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
-        const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
-        const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
-        const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
-        const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
-
-        // <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
-        // First byte indicates standard SSL handshake message
-        // (It can't be a Postgres startup length because in network byte order
-        // that would be a startup packet hundreds of megabytes long)
-        if buf.first() == Some(&0x16) {
-            return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
-        }
+        const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
+        const CANCEL_REQUEST_CODE: u32 = 5678;
+        const NEGOTIATE_SSL_CODE: u32 = 5679;
+        const NEGOTIATE_GSS_CODE: u32 = 5680;

        // need at least 4 bytes with packet len
        if buf.len() < 4 {
@@ -375,10 +338,12 @@ impl FeStartupPacket {
        let mut msg = buf.split_to(len).freeze();
        msg.advance(4); // consume len

-        let request_code = ProtocolVersion(msg.get_u32());
+        let request_code = msg.get_u32();
+        let req_hi = request_code >> 16;
+        let req_lo = request_code & ((1 << 16) - 1);
        // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
-        let message = match request_code {
-            CANCEL_REQUEST_CODE => {
+        let message = match (req_hi, req_lo) {
+            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
                if msg.remaining() != 8 {
                    return Err(ProtocolError::BadMessage(
                        "CancelRequest message is malformed, backend PID / secret key missing"
@@ -390,22 +355,21 @@ impl FeStartupPacket {
                    cancel_key: msg.get_i32(),
                })
            }
-            NEGOTIATE_SSL_CODE => {
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
                // Requested upgrade to SSL (aka TLS)
-                FeStartupPacket::SslRequest { direct: false }
+                FeStartupPacket::SslRequest
            }
-            NEGOTIATE_GSS_CODE => {
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
                // Requested upgrade to GSSAPI
                FeStartupPacket::GssEncRequest
            }
-            version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
+            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
                return Err(ProtocolError::Protocol(format!(
-                    "Unrecognized request code {}",
-                    version.minor()
+                    "Unrecognized request code {unrecognized_code}"
                )));
            }
            // TODO bail if protocol major_version is not 3?
-            version => {
+            (major_version, minor_version) => {
                // StartupMessage

                let s = str::from_utf8(&msg).map_err(|_e| {
@@ -418,7 +382,8 @@ impl FeStartupPacket {
                })?;

                FeStartupPacket::StartupMessage {
-                    version,
+                    major_version,
+                    minor_version,
                    params: StartupMessageParams {
                        params: msg.slice_ref(s.as_bytes()),
                    },
@@ -557,10 +522,6 @@ pub enum BeMessage<'a> {
    RowDescription(&'a [RowDescriptor<'a>]),
    XLogData(XLogDataBody<'a>),
    NoticeResponse(&'a str),
-    NegotiateProtocolVersion {
-        version: ProtocolVersion,
-        options: &'a [&'a str],
-    },
    KeepAlive(WalSndKeepAlive),
 }

@@ -984,18 +945,6 @@ impl<'a> BeMessage<'a> {
                    buf.put_u8(u8::from(req.request_reply));
                });
            }
-
-            BeMessage::NegotiateProtocolVersion { version, options } => {
-                buf.put_u8(b'v');
-                write_body(buf, |buf| {
-                    buf.put_u32(version.0);
-                    buf.put_u32(options.len() as u32);
-                    for option in options.iter() {
-                        write_cstr(option, buf)?;
-                    }
-                    Ok(())
-                })?
-            }
        }
        Ok(())
    }
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -1,5 +1,6 @@
 use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};

+use anyhow::bail;
 use aws_sdk_s3::types::StorageClass;
 use camino::Utf8PathBuf;

@@ -175,8 +176,20 @@ fn serialize_storage_class<S: serde::Serializer>(
 impl RemoteStorageConfig {
    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);

-    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
-        Ok(utils::toml_edit_ext::deserialize_item(toml)?)
+    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
+        let document: toml_edit::Document = match toml {
+            toml_edit::Item::Table(toml) => toml.clone().into(),
+            toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
+                toml.clone().into_table().into()
+            }
+            _ => bail!("toml not a table or inline table"),
+        };
+
+        if document.is_empty() {
+            return Ok(None);
+        }
+
+        Ok(Some(toml_edit::de::from_document(document)?))
    }
 }

@@ -184,7 +197,7 @@ impl RemoteStorageConfig {
 mod tests {
    use super::*;

-    fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
+    fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let toml = input.parse::<toml_edit::Document>().unwrap();
        RemoteStorageConfig::from_toml(toml.as_item())
    }
@@ -194,7 +207,7 @@ mod tests {
        let input = "local_path = '.'
 timeout = '5s'";

-        let config = parse(input).unwrap();
+        let config = parse(input).unwrap().expect("it exists");

        assert_eq!(
            config,
@@ -216,7 +229,7 @@ timeout = '5s'";
    timeout = '7s'
    ";

-        let config = parse(toml).unwrap();
+        let config = parse(toml).unwrap().expect("it exists");

        assert_eq!(
            config,
@@ -244,7 +257,7 @@ timeout = '5s'";
    timeout = '7s'
    ";

-        let config = parse(toml).unwrap();
+        let config = parse(toml).unwrap().expect("it exists");

        assert_eq!(
            config,
--- a/libs/tenant_size_model/src/calculation.rs
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -34,10 +34,10 @@ struct SegmentSize {
 }

 struct SizeAlternatives {
-    /// cheapest alternative if parent is available.
+    // cheapest alternative if parent is available.
    incremental: SegmentSize,

-    /// cheapest alternative if parent node is not available
+    // cheapest alternative if parent node is not available
    non_incremental: Option<SegmentSize>,
 }

--- a/libs/tenant_size_model/src/svg.rs
+++ b/libs/tenant_size_model/src/svg.rs
@@ -3,17 +3,10 @@ use std::fmt::Write;

 const SVG_WIDTH: f32 = 500.0;

-/// Different branch kind for SVG drawing.
-#[derive(PartialEq)]
-pub enum SvgBranchKind {
-    Timeline,
-    Lease,
-}
-
 struct SvgDraw<'a> {
    storage: &'a StorageModel,
    branches: &'a [String],
-    seg_to_branch: &'a [(usize, SvgBranchKind)],
+    seg_to_branch: &'a [usize],
    sizes: &'a [SegmentSizeResult],

    // layout
@@ -49,18 +42,13 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> {
        "<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
    )?;
    writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
-    writeln!(
-        result,
-        "<line x1=\"10\" y1=\"85\" x2=\"10\" y2=\"95\" stroke-width=\"3\" stroke=\"blue\" />"
-    )?;
-    writeln!(result, "<text x=\"20\" y=\"95\">LSN lease</text>")?;
    Ok(())
 }

 pub fn draw_svg(
    storage: &StorageModel,
    branches: &[String],
-    seg_to_branch: &[(usize, SvgBranchKind)],
+    seg_to_branch: &[usize],
    sizes: &SizeResult,
 ) -> anyhow::Result<String> {
    let mut draw = SvgDraw {
@@ -112,7 +100,7 @@ impl<'a> SvgDraw<'a> {

        // Layout the timelines on Y dimension.
        // TODO
-        let mut y = 120.0;
+        let mut y = 100.0;
        let mut branch_y_coordinates = Vec::new();
        for _branch in self.branches {
            branch_y_coordinates.push(y);
@@ -121,7 +109,7 @@ impl<'a> SvgDraw<'a> {

        // Calculate coordinates for each point
        let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
-            .map(|(seg, (branch_id, _))| {
+            .map(|(seg, branch_id)| {
                let x = (seg.lsn - min_lsn) as f32 / xscale;
                let y = branch_y_coordinates[*branch_id];
                (x, y)
@@ -187,22 +175,6 @@ impl<'a> SvgDraw<'a> {

        // draw a snapshot point if it's needed
        let (coord_x, coord_y) = self.seg_coordinates[seg_id];
-
-        let (_, kind) = &self.seg_to_branch[seg_id];
-        if kind == &SvgBranchKind::Lease {
-            let (x1, y1) = (coord_x, coord_y - 10.0);
-            let (x2, y2) = (coord_x, coord_y + 10.0);
-
-            let style = "stroke-width=\"3\" stroke=\"blue\"";
-
-            writeln!(
-                result,
-                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
-            )?;
-            writeln!(result, "  <title>leased lsn at {}</title>", seg.lsn)?;
-            writeln!(result, "</line>")?;
-        }
-
        if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
            writeln!(
                result,
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -40,7 +40,6 @@ thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
-toml_edit.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -74,15 +74,6 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
        .transpose()
 }

-pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
-    request: &Request<Body>,
-    param_name: &str,
-) -> Result<T, ApiError> {
-    parse_query_param(request, param_name)?.ok_or_else(|| {
-        ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
-    })
-}
-
 pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
    match request.body_mut().data().await {
        Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -26,8 +26,6 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;

-pub mod shard;
-
 mod hex;
 pub use hex::Hex;

@@ -96,8 +94,6 @@ pub mod env;

 pub mod poison;

-pub mod toml_edit_ext;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -1,451 +0,0 @@
-//! See `pageserver_api::shard` for description on sharding.
-
-use std::{ops::RangeInclusive, str::FromStr};
-
-use hex::FromHex;
-use serde::{Deserialize, Serialize};
-
-use crate::id::TenantId;
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardNumber(pub u8);
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(pub u8);
-
-/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-/// Formatting helper, for generating the `shard_id` label in traces.
-pub struct ShardSlug<'a>(&'a TenantShardId);
-
-/// TenantShardId globally identifies a particular shard in a particular tenant.
-///
-/// These are written as `<TenantId>-<ShardSlug>`, for example:
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
-/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
-/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
-///
-/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible with TenantId: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-impl ShardCount {
-    pub const MAX: Self = Self(u8::MAX);
-
-    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
-    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
-    ///
-    /// This method returns the actual number of shards, i.e. if our internal value is
-    /// zero, we return 1 (unsharded tenants have 1 shard).
-    pub fn count(&self) -> u8 {
-        if self.0 > 0 {
-            self.0
-        } else {
-            1
-        }
-    }
-
-    /// The literal internal value: this is **not** the number of shards in the
-    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
-    /// [`Self::count`] if you want to know the cardinality of shards.
-    pub fn literal(&self) -> u8 {
-        self.0
-    }
-
-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
-    pub fn is_unsharded(&self) -> bool {
-        self.0 == 0
-    }
-
-    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
-    /// [`Self::literal`] would return.
-    pub const fn new(val: u8) -> Self {
-        Self(val)
-    }
-}
-
-impl ShardNumber {
-    pub const MAX: Self = Self(u8::MAX);
-}
-
-impl TenantShardId {
-    pub fn unsharded(tenant_id: TenantId) -> Self {
-        Self {
-            tenant_id,
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
-    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
-    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
-        RangeInclusive::new(
-            Self {
-                tenant_id,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            },
-            Self {
-                tenant_id,
-                shard_number: ShardNumber::MAX,
-                shard_count: ShardCount::MAX,
-            },
-        )
-    }
-
-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-
-    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
-        self.shard_number == ShardNumber(0)
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
-    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
-    pub fn to_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_number: self.shard_number,
-            shard_count: self.shard_count,
-        }
-    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
-}
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
-    }
-}
-
-impl std::fmt::Display for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
-        } else {
-            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
-            // is distinct from the normal single shard case (shard count == 1).
-            self.tenant_id.fmt(f)
-        }
-    }
-}
-
-impl std::fmt::Debug for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for TenantShardId {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
-        if s.len() == 32 {
-            // Legacy case: no shard specified
-            Ok(Self {
-                tenant_id: TenantId::from_str(s)?,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            })
-        } else if s.len() == 37 {
-            let bytes = s.as_bytes();
-            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
-            Ok(Self {
-                tenant_id,
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 18]> for TenantShardId {
-    fn from(b: [u8; 18]) -> Self {
-        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
-
-        Self {
-            tenant_id: TenantId::from(tenant_id_bytes),
-            shard_number: ShardNumber(b[16]),
-            shard_count: ShardCount(b[17]),
-        }
-    }
-}
-
-impl ShardIndex {
-    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            shard_number: number,
-            shard_count: count,
-        }
-    }
-    pub fn unsharded() -> Self {
-        Self {
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
-
-    /// For use in constructing remote storage paths: concatenate this with a TenantId
-    /// to get a fully qualified TenantShardId.
-    ///
-    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
-    /// that the legacy pre-sharding remote key format is preserved.
-    pub fn get_suffix(&self) -> String {
-        if self.is_unsharded() {
-            "".to_string()
-        } else {
-            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-        }
-    }
-}
-
-impl std::fmt::Display for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-    }
-}
-
-impl std::fmt::Debug for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for ShardIndex {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 1 byte shard number, 1 byte shard count
-        if s.len() == 4 {
-            let bytes = s.as_bytes();
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(bytes, &mut shard_parts)?;
-            Ok(Self {
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 2]> for ShardIndex {
-    fn from(b: [u8; 2]) -> Self {
-        Self {
-            shard_number: ShardNumber(b[0]),
-            shard_count: ShardCount(b[1]),
-        }
-    }
-}
-
-impl Serialize for TenantShardId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Note: while human encoding of [`TenantShardId`] is backward and forward
-            // compatible, this binary encoding is not.
-            let mut packed: [u8; 18] = [0; 18];
-            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
-            packed[16] = self.shard_number.0;
-            packed[17] = self.shard_count.0;
-
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for TenantShardId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = TenantShardId;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 18])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 18] = Deserialize::deserialize(s)?;
-                Ok(TenantShardId::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                TenantShardId::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                18,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
-impl Serialize for ShardIndex {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Binary encoding is not used in index_part.json, but is included in anticipation of
-            // switching various structures (e.g. inter-process communication, remote metadata) to more
-            // compact binary encodings in future.
-            let mut packed: [u8; 2] = [0; 2];
-            packed[0] = self.shard_number.0;
-            packed[1] = self.shard_count.0;
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for ShardIndex {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = ShardIndex;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 2])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 2] = Deserialize::deserialize(s)?;
-                Ok(ShardIndex::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                ShardIndex::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                2,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
--- a/libs/utils/src/toml_edit_ext.rs
+++ b/libs/utils/src/toml_edit_ext.rs
@@ -1,22 +0,0 @@
-#[derive(Debug, thiserror::Error)]
-pub enum Error {
-    #[error("item is not a document")]
-    ItemIsNotADocument,
-    #[error(transparent)]
-    Serde(toml_edit::de::Error),
-}
-
-pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
-where
-    T: serde::de::DeserializeOwned,
-{
-    let document: toml_edit::Document = match item {
-        toml_edit::Item::Table(toml) => toml.clone().into(),
-        toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
-            toml.clone().into_table().into()
-        }
-        _ => return Err(Error::ItemIsNotADocument),
-    };
-
-    toml_edit::de::from_document(document).map_err(Error::Serde)
-}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -62,7 +62,6 @@ sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
-tikv-jemallocator.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
 tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 pageserver_api.workspace = true
 thiserror.workspace = true
 async-trait.workspace = true
-reqwest = { workspace = true, features = [ "stream" ] }
+reqwest.workspace = true
 utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -9,8 +9,6 @@ use utils::{
    lsn::Lsn,
 };

-pub use reqwest::Body as ReqwestBody;
-
 pub mod util;

 #[derive(Debug, Clone)]
@@ -22,9 +20,6 @@ pub struct Client {

 #[derive(thiserror::Error, Debug)]
 pub enum Error {
-    #[error("send request: {0}")]
-    SendRequest(reqwest::Error),
-
    #[error("receive body: {0}")]
    ReceiveBody(reqwest::Error),

@@ -178,30 +173,19 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }

-    fn start_request<U: reqwest::IntoUrl>(
-        &self,
-        method: Method,
-        uri: U,
-    ) -> reqwest::RequestBuilder {
-        let req = self.client.request(method, uri);
-        if let Some(value) = &self.authorization_header {
-            req.header(reqwest::header::AUTHORIZATION, value)
-        } else {
-            req
-        }
-    }
-
    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
        body: B,
    ) -> Result<reqwest::Response> {
-        self.start_request(method, uri)
-            .json(&body)
-            .send()
-            .await
-            .map_err(Error::ReceiveBody)
+        let req = self.client.request(method, uri);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+        req.json(&body).send().await.map_err(Error::ReceiveBody)
    }

    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -221,6 +205,15 @@ impl Client {
        Ok(())
    }

+    pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
+        let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
+        self.request(Method::POST, &uri, req)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
    /// The tenant deletion API can return 202 if deletion is incomplete, or
    /// 404 if it is complete.  Callers are responsible for checking the status
    /// code and retrying.  Error codes other than 404 will return Err().
@@ -625,53 +618,4 @@ impl Client {
            }),
        }
    }
-
-    pub async fn import_basebackup(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        base_lsn: Lsn,
-        end_lsn: Lsn,
-        pg_version: u32,
-        basebackup_tarball: ReqwestBody,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
-            self.mgmt_api_endpoint,
-        );
-        self.start_request(Method::PUT, uri)
-            .body(basebackup_tarball)
-            .send()
-            .await
-            .map_err(Error::SendRequest)?
-            .error_from_body()
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn import_wal(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        start_lsn: Lsn,
-        end_lsn: Lsn,
-        wal_tarball: ReqwestBody,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
-            self.mgmt_api_endpoint,
-        );
-        self.start_request(Method::PUT, uri)
-            .body(wal_tarball)
-            .send()
-            .await
-            .map_err(Error::SendRequest)?
-            .error_from_body()
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
 }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> {
            let toml_item = toml_document
                .get("remote_storage")
                .expect("need remote_storage");
-            let config = RemoteStorageConfig::from_toml(toml_item)?;
+            let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
            let storage = remote_storage::GenericRemoteStorage::from_config(&config);
            let cancel = CancellationToken::new();
            storage
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -348,36 +348,35 @@ where
                    self.add_rel(rel, rel).await?;
                }
            }
-        }

-        for (path, content) in self
-            .timeline
-            .list_aux_files(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
-        {
-            if path.starts_with("pg_replslot") {
-                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
-                let restart_lsn = Lsn(u64::from_le_bytes(
-                    content[offs..offs + 8].try_into().unwrap(),
-                ));
-                info!("Replication slot {} restart LSN={}", path, restart_lsn);
-                min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
-            } else if path == "pg_logical/replorigin_checkpoint" {
-                // replorigin_checkoint is written only on compute shutdown, so it contains
-                // deteriorated values. So we generate our own version of this file for the particular LSN
-                // based on information about replorigins extracted from transaction commit records.
-                // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
-                // but now we should handle (skip) it for backward compatibility.
-                continue;
-            }
-            let header = new_tar_header(&path, content.len() as u64)?;
-            self.ar
-                .append(&header, &*content)
+            for (path, content) in self
+                .timeline
+                .list_aux_files(self.lsn, self.ctx)
                .await
-                .context("could not add aux file to basebackup tarball")?;
+                .map_err(|e| BasebackupError::Server(e.into()))?
+            {
+                if path.starts_with("pg_replslot") {
+                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
+                    let restart_lsn = Lsn(u64::from_le_bytes(
+                        content[offs..offs + 8].try_into().unwrap(),
+                    ));
+                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
+                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
+                } else if path == "pg_logical/replorigin_checkpoint" {
+                    // replorigin_checkpoint is written only on compute shutdown, so it contains
+                    // stale values. Instead, we generate our own version of this file for the particular LSN
+                    // based on information about replorigins extracted from transaction commit records.
+                    // In the future we will not generate an AUX record for "pg_logical/replorigin_checkpoint" at all,
+                    // but for now we must handle (skip) it for backward compatibility.
+                    continue;
+                }
+                let header = new_tar_header(&path, content.len() as u64)?;
+                self.ar
+                    .append(&header, &*content)
+                    .await
+                    .context("could not add aux file to basebackup tarball")?;
+            }
        }
-
        if min_restart_lsn != Lsn::MAX {
            info!(
                "Min restart LSN for logical replication is {}",
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -47,9 +47,6 @@ use utils::{
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-#[global_allocator]
-static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
-
 const PID_FILE_NAME: &str = "pageserver.pid";

 const FEATURES: &[&str] = &[
@@ -424,10 +421,6 @@ fn start_pageserver(
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

-    info!(config=?conf.l0_flush, "using l0_flush config");
-    let l0_flush_global_state =
-        pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
-
    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -436,7 +429,6 @@ fn start_pageserver(
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
-            l0_flush_global_state,
        },
        order,
        shutdown_pageserver.clone(),
@@ -660,6 +652,7 @@ fn start_pageserver(
                async move {
                    page_service::libpq_listener_main(
                        tenant_manager,
+                        broker_client,
                        pg_auth,
                        pageserver_listener,
                        conf.pg_auth_type,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -5,7 +5,7 @@
 //! See also `settings.md` for better description on every parameter.

 use anyhow::{anyhow, bail, ensure, Context, Result};
-use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde;
 use serde::de::IntoDeserializer;
@@ -30,11 +30,11 @@ use utils::{
    logging::LogFormat,
 };

+use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
-use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};

@@ -50,7 +50,6 @@ pub mod defaults {
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_PG_LISTEN_PORT,
    };
-    use pageserver_api::models::ImageCompressionAlgorithm;
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
@@ -91,9 +90,6 @@ pub mod defaults {

    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

-    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
-        ImageCompressionAlgorithm::Disabled;
-
    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
@@ -163,7 +159,7 @@ pub mod defaults {

 #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}

-#[remote_storage]
+[remote_storage]

 "#
    );
@@ -289,16 +285,12 @@ pub struct PageServerConf {

    pub validate_vectored_get: bool,

-    pub image_compression: ImageCompressionAlgorithm,
-
    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
    /// of ephemeral data.
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
-
-    pub l0_flush: L0FlushConfig,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -403,11 +395,7 @@ struct PageServerConfigBuilder {

    validate_vectored_get: BuilderValue<bool>,

-    image_compression: BuilderValue<ImageCompressionAlgorithm>,
-
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
-
-    l0_flush: BuilderValue<L0FlushConfig>,
 }

 impl PageServerConfigBuilder {
@@ -494,10 +482,8 @@ impl PageServerConfigBuilder {
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
-            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
-            l0_flush: Set(L0FlushConfig::default()),
        }
    }
 }
@@ -681,18 +667,10 @@ impl PageServerConfigBuilder {
        self.validate_vectored_get = BuilderValue::Set(value);
    }

-    pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
-        self.image_compression = BuilderValue::Set(value);
-    }
-
    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
    }

-    pub fn l0_flush(&mut self, value: L0FlushConfig) {
-        self.l0_flush = BuilderValue::Set(value);
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -749,9 +727,7 @@ impl PageServerConfigBuilder {
                get_impl,
                max_vectored_read_bytes,
                validate_vectored_get,
-                image_compression,
                ephemeral_bytes_per_memory_kb,
-                l0_flush,
            }
            CUSTOM LOGIC
            {
@@ -942,7 +918,7 @@ impl PageServerConf {
                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
                "remote_storage" => {
-                    builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
+                    builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
                }
                "tenant_config" => {
                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
@@ -970,7 +946,7 @@ impl PageServerConf {
                    builder.metric_collection_endpoint(Some(endpoint));
                },
                "metric_collection_bucket" => {
-                    builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
+                    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
                }
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
@@ -1028,15 +1004,9 @@ impl PageServerConf {
                "validate_vectored_get" => {
                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
                }
-                "image_compression" => {
-                    builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
-                }
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
-                "l0_flush" => {
-                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1118,10 +1088,8 @@ impl PageServerConf {
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
            ),
-            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-            l0_flush: L0FlushConfig::default(),
        }
    }
 }
@@ -1360,9 +1328,7 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                l0_flush: L0FlushConfig::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1435,9 +1401,7 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
-                l0_flush: L0FlushConfig::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1717,19 +1681,6 @@ threshold = "20m"
        }
    }

-    #[test]
-    fn empty_remote_storage_is_error() {
-        let tempdir = tempdir().unwrap();
-        let (workdir, _) = prepare_fs(&tempdir).unwrap();
-        let input = r#"
-remote_storage = {}
-        "#;
-        let doc = toml_edit::Document::from_str(input).unwrap();
-        let err = PageServerConf::parse_and_validate(&doc, &workdir)
-            .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
-        assert!(format!("{err}").contains("remote_storage"), "{err}");
-    }
-
    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
        let tempdir_path = tempdir.path();

--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -190,7 +190,7 @@ where
                }
            } else {
                // If we failed validation, then do not apply any of the projected updates
-                info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+                warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
            }
        }
@@ -225,7 +225,7 @@ where
                    && (tenant.generation == *validated_generation);

                if !this_list_valid {
-                    info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
                } else {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -265,19 +265,15 @@ paths:
          type: string
          format: hex
    post:
-      description: Obtains a lease for the given LSN.
-      requestBody:
-        content:
-          application/json:
-            schema:
-              type: object
-              required:
-               - lsn
-              properties:
-                lsn:
-                  description: A LSN to obtain the lease for.
-                  type: string
-                  format: hex
+      description: Obtain lease for the given LSN
+      parameters:
+        - name: lsn
+          in: query
+          required: true
+          schema:
+            type: string
+            format: hex
+          description: A LSN to obtain the lease for
      responses:
        "200":
          description: OK
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -10,7 +10,6 @@ use std::time::Duration;

 use anyhow::{anyhow, Context, Result};
 use enumset::EnumSet;
-use futures::StreamExt;
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
 use hyper::header;
@@ -23,7 +22,6 @@ use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::LsnLease;
-use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigResponse;
@@ -44,19 +42,18 @@ use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
-use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
-use tokio_util::io::StreamReader;
+use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
-use utils::http::request::must_parse_query_param;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
+use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
@@ -78,12 +75,13 @@ use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
+use crate::tenant::SpawnMode;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
-    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
-    TimelineInfo,
+    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
+    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use utils::{
    auth::SwappableJwtAuth,
@@ -231,7 +229,7 @@ impl From<UpsertLocationError> for ApiError {
            BadRequest(e) => ApiError::BadRequest(e),
            Unavailable(_) => ApiError::ShuttingDown,
            e @ InProgress => ApiError::Conflict(format!("{e}")),
-            Flush(e) | InternalError(e) => ApiError::InternalServerError(e),
+            Flush(e) | Other(e) => ApiError::InternalServerError(e),
        }
    }
 }
@@ -410,8 +408,6 @@ async fn build_timeline_info_common(

    let walreceiver_status = timeline.walreceiver_status();

-    let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
-
    let info = TimelineInfo {
        tenant_id: timeline.tenant_shard_id,
        timeline_id: timeline.timeline_id,
@@ -432,8 +428,6 @@ async fn build_timeline_info_common(
        directory_entries_counts: timeline.get_directory_metrics().to_vec(),
        current_physical_size,
        current_logical_size_non_incremental: None,
-        pitr_history_size,
-        within_ancestor_pitr,
        timeline_dir_layer_file_size_sum: None,
        wal_source_connstr,
        last_received_msg_lsn,
@@ -1199,15 +1193,10 @@ fn synthetic_size_html_response(
        timeline_map.insert(ti.timeline_id, index);
        timeline_ids.push(ti.timeline_id.to_string());
    }
-    let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs
+    let seg_to_branch: Vec<usize> = inputs
        .segments
        .iter()
-        .map(|seg| {
-            (
-                *timeline_map.get(&seg.timeline_id).unwrap(),
-                seg.kind.into(),
-            )
-        })
+        .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
        .collect();

    let svg =
@@ -1248,6 +1237,75 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
 }

+/// Resolves the generation for a request. When control_plane_api is set, `req_gen` is mandatory and
+/// a missing value yields ApiError::BadRequest; otherwise `req_gen` is ignored and Generation::none() is used.
+fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
+    if state.conf.control_plane_api.is_some() {
+        req_gen
+            .map(Generation::new)
+            .ok_or(ApiError::BadRequest(anyhow!(
+                "generation attribute missing"
+            )))
+    } else {
+        // Legacy mode: all tenants operate with no generation
+        Ok(Generation::none())
+    }
+}
+
+async fn tenant_create_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let target_tenant_id = request_data.new_tenant_id;
+    check_permission(&request, None)?;
+
+    let _timer = STORAGE_TIME_GLOBAL
+        .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()])
+        .expect("bug")
+        .start_timer();
+
+    let tenant_conf =
+        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+
+    let state = get_state(&request);
+
+    let generation = get_request_generation(state, request_data.generation)?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let location_conf =
+        LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);
+
+    let new_tenant = state
+        .tenant_manager
+        .upsert_location(
+            target_tenant_id,
+            location_conf,
+            None,
+            SpawnMode::Create,
+            &ctx,
+        )
+        .await?;
+
+    let Some(new_tenant) = new_tenant else {
+        // This should never happen: indicates a bug in upsert_location
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "Upsert succeeded but didn't return tenant!"
+        )));
+    };
+    // The tenant now exists. Existing API semantics are that the tenant is Active when this
+    // function returns, so wait for activation (passing ACTIVE_TENANT_TIMEOUT) before responding.
+    new_tenant
+        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
+        .await?;
+
+    json_response(
+        StatusCode::CREATED,
+        TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
+    )
+}
+
 async fn get_tenant_config_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1309,7 +1367,7 @@ async fn update_tenant_config_handler(

    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
-        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
+        .map_err(ApiError::InternalServerError)?;
    tenant.set_new_tenant_config(new_tenant_conf);

    json_response(StatusCode::OK, ())
@@ -1540,13 +1598,15 @@ async fn handle_tenant_break(

 // Obtains an lsn lease on the given timeline.
 async fn lsn_lease_handler(
-    mut request: Request<Body>,
+    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let lsn = json_request::<LsnLeaseRequest>(&mut request).await?.lsn;
+
+    let lsn: Lsn = parse_query_param(&request, "lsn")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

@@ -2407,189 +2467,6 @@ async fn post_top_tenants(
    )
 }

-async fn put_tenant_timeline_import_basebackup(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
-    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
-
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
-    async move {
-        let state = get_state(&request);
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
-
-        let broker_client = state.broker_client.clone();
-
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
-
-        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-        let timeline = tenant
-            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
-            .map_err(ApiError::InternalServerError)
-            .await?;
-
-        // TODO mark timeline as not ready until it reaches end_lsn.
-        // We might have some wal to import as well, and we should prevent compute
-        // from connecting before that and writing conflicting wal.
-        //
-        // This is not relevant for pageserver->pageserver migrations, since there's
-        // no wal to import. But should be fixed if we want to import from postgres.
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import basebackup provided via CopyData
-        info!("importing basebackup");
-
-        timeline
-            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        // Read the end of the tar archive.
-        read_tar_eof(body)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        // TODO check checksum
-        // Meanwhile you can verify client-side by taking fullbackup
-        // and checking that it matches in size with what was imported.
-        // It wouldn't work if base came from vanilla postgres though,
-        // since we discard some log files.
-
-        info!("done");
-        json_response(StatusCode::OK, ())
-    }
-    .instrument(span)
-    .await
-}
-
-async fn put_tenant_timeline_import_wal(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
-    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
-    async move {
-        let state = get_state(&request);
-
-        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
-
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
-
-        let last_record_lsn = timeline.get_last_record_lsn();
-        if last_record_lsn != start_lsn {
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
-        }
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import wal provided via CopyData
-        info!("importing wal");
-        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
-        info!("wal import complete");
-
-        // Read the end of the tar archive.
-        read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
-
-        // TODO Does it make sense to overshoot?
-        if timeline.get_last_record_lsn() < end_lsn {
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
-        }
-
-        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
-        // We only want to persist the data, and it doesn't matter if it's in the
-        // shape of deltas or images.
-        info!("flushing layers");
-        timeline.freeze_and_flush().await.map_err(|e| match e {
-            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
-            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-        })?;
-
-        info!("done");
-
-        json_response(StatusCode::OK, ())
-    }.instrument(span).await
-}
-
-/// Read the end of a tar archive.
-///
-/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
-/// `tokio_tar` already read the first such block. Read the second all-zeros block,
-/// and check that there is no more data after the EOF marker.
-///
-/// 'tar' command can also write extra blocks of zeros, up to a record
-/// size, controlled by the --record-size argument. Ignore them too.
-async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
-    use tokio::io::AsyncReadExt;
-    let mut buf = [0u8; 512];
-
-    // Read the all-zeros block, and verify it
-    let mut total_bytes = 0;
-    while total_bytes < 512 {
-        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
-        total_bytes += nbytes;
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if total_bytes < 512 {
-        anyhow::bail!("incomplete or invalid tar EOF marker");
-    }
-    if !buf.iter().all(|&x| x == 0) {
-        anyhow::bail!("invalid tar EOF marker");
-    }
-
-    // Drain any extra zero-blocks after the EOF marker
-    let mut trailing_bytes = 0;
-    let mut seen_nonzero_bytes = false;
-    loop {
-        let nbytes = reader.read(&mut buf).await?;
-        trailing_bytes += nbytes;
-        if !buf.iter().all(|&x| x == 0) {
-            seen_nonzero_bytes = true;
-        }
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if seen_nonzero_bytes {
-        anyhow::bail!("unexpected non-zero bytes after the tar archive");
-    }
-    if trailing_bytes % 512 != 0 {
-        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
-    }
-    Ok(())
-}
-
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2734,6 +2611,7 @@ pub fn make_router(
            api_handler(r, reload_auth_validation_keys_handler)
        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
+        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_shard_id", |r| {
            api_handler(r, tenant_status)
        })
@@ -2884,13 +2762,5 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
            |r| testing_api_handler("perf_info", r, perf_info),
        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
-            |r| api_handler(r, put_tenant_timeline_import_basebackup),
-        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
-            |r| api_handler(r, put_tenant_timeline_import_wal),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,46 +0,0 @@
-use std::{num::NonZeroUsize, sync::Arc};
-
-use crate::tenant::ephemeral_file;
-
-#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum L0FlushConfig {
-    #[default]
-    PageCached,
-    #[serde(rename_all = "snake_case")]
-    Direct { max_concurrency: NonZeroUsize },
-}
-
-#[derive(Clone)]
-pub struct L0FlushGlobalState(Arc<Inner>);
-
-pub(crate) enum Inner {
-    PageCached,
-    Direct { semaphore: tokio::sync::Semaphore },
-}
-
-impl L0FlushGlobalState {
-    pub fn new(config: L0FlushConfig) -> Self {
-        match config {
-            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
-            L0FlushConfig::Direct { max_concurrency } => {
-                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
-                Self(Arc::new(Inner::Direct { semaphore }))
-            }
-        }
-    }
-
-    pub(crate) fn inner(&self) -> &Arc<Inner> {
-        &self.0
-    }
-}
-
-impl L0FlushConfig {
-    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
-        use L0FlushConfig::*;
-        match self {
-            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
-            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
-        }
-    }
-}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -11,7 +11,6 @@ pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
-pub mod l0_flush;
 pub use pageserver_api::keyspace;
 pub mod aux_file;
 pub mod metrics;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -8,7 +8,7 @@ use metrics::{
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
-use strum::{EnumCount, VariantNames};
+use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use tracing::warn;
 use utils::id::TimelineId;
@@ -53,6 +53,9 @@ pub(crate) enum StorageTimeOperation {

    #[strum(serialize = "find gc cutoffs")]
    FindGcCutoffs,
+
+    #[strum(serialize = "create tenant")]
+    CreateTenant,
 }

 pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
@@ -464,24 +467,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_pitr_history_size",
-        "Data written since PITR cutoff on this timeline",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
-static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_archive_size",
-        "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_standby_horizon",
@@ -494,7 +479,7 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_resident_physical_size",
-        "The size of the layer files present in the pageserver's filesystem, for attached locations.",
+        "The size of the layer files present in the pageserver's filesystem.",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
@@ -1094,12 +1079,21 @@ pub(crate) mod virtual_file_io_engine {
    });
 }

+#[derive(Debug)]
+struct GlobalAndPerTimelineHistogram {
+    global: Histogram,
+    per_tenant_timeline: Histogram,
+}
+
+impl GlobalAndPerTimelineHistogram {
+    fn observe(&self, value: f64) {
+        self.global.observe(value);
+        self.per_tenant_timeline.observe(value);
+    }
+}
+
 struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    global_metric: &'a Histogram,
-
-    // Optional because not all op types are tracked per-timeline
-    timeline_metric: Option<&'a Histogram>,
-
+    h: &'a GlobalAndPerTimelineHistogram,
    ctx: &'c RequestContext,
    start: std::time::Instant,
    op: SmgrQueryType,
@@ -1130,10 +1124,7 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                elapsed
            }
        };
-        self.global_metric.observe(ex_throttled.as_secs_f64());
-        if let Some(timeline_metric) = self.timeline_metric {
-            timeline_metric.observe(ex_throttled.as_secs_f64());
-        }
+        self.h.observe(ex_throttled.as_secs_f64());
    }
 }

@@ -1158,8 +1149,7 @@ pub enum SmgrQueryType {

 #[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
-    global_metrics: [Histogram; SmgrQueryType::COUNT],
-    per_timeline_getpage: Histogram,
+    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }

 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
@@ -1237,32 +1227,27 @@ impl SmgrQueryTimePerTimeline {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
-        let global_metrics = std::array::from_fn(|i| {
+        let metrics = std::array::from_fn(|i| {
            let op = SmgrQueryType::from_repr(i).unwrap();
-            SMGR_QUERY_TIME_GLOBAL
+            let global = SMGR_QUERY_TIME_GLOBAL
                .get_metric_with_label_values(&[op.into()])
-                .unwrap()
+                .unwrap();
+            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
+                .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
+                .unwrap();
+            GlobalAndPerTimelineHistogram {
+                global,
+                per_tenant_timeline,
+            }
        });
-
-        let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
-            .get_metric_with_label_values(&[
-                SmgrQueryType::GetPageAtLsn.into(),
-                &tenant_id,
-                &shard_slug,
-                &timeline_id,
-            ])
-            .unwrap();
-        Self {
-            global_metrics,
-            per_timeline_getpage,
-        }
+        Self { metrics }
    }
    pub(crate) fn start_timer<'c: 'a, 'a>(
        &'a self,
        op: SmgrQueryType,
        ctx: &'c RequestContext,
-    ) -> Option<impl Drop + '_> {
-        let global_metric = &self.global_metrics[op as usize];
+    ) -> impl Drop + '_ {
+        let metric = &self.metrics[op as usize];
        let start = Instant::now();
        match ctx.micros_spent_throttled.open() {
            Ok(()) => (),
@@ -1281,20 +1266,12 @@ impl SmgrQueryTimePerTimeline {
                });
            }
        }
-
-        let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
-            Some(&self.per_timeline_getpage)
-        } else {
-            None
-        };
-
-        Some(GlobalAndPerTimelineHistogramTimer {
-            global_metric,
-            timeline_metric,
+        GlobalAndPerTimelineHistogramTimer {
+            h: metric,
            ctx,
            start,
            op,
-        })
+        }
    }
 }

@@ -1341,9 +1318,17 @@ mod smgr_query_time_tests {
            let get_counts = || {
                let global: u64 = ops
                    .iter()
-                    .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
+                    .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
                    .sum();
-                (global, metrics.per_timeline_getpage.get_sample_count())
+                let per_tenant_timeline: u64 = ops
+                    .iter()
+                    .map(|op| {
+                        metrics.metrics[*op as usize]
+                            .per_tenant_timeline
+                            .get_sample_count()
+                    })
+                    .sum();
+                (global, per_tenant_timeline)
            };

            let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1354,12 +1339,7 @@ mod smgr_query_time_tests {
            drop(timer);

            let (post_global, post_per_tenant_timeline) = get_counts();
-            if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
-                // getpage ops are tracked per-timeline, others aren't
-                assert_eq!(post_per_tenant_timeline, 1);
-            } else {
-                assert_eq!(post_per_tenant_timeline, 0);
-            }
+            assert_eq!(post_per_tenant_timeline, 1);
            assert!(post_global > pre_global);
        }
    }
@@ -1456,12 +1436,10 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
    }
 }

-pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "pageserver_live_connections_started",
-        "Number of network connections that we started handling",
-        "pageserver_live_connections_finished",
-        "Number of network connections that we finished handling",
+pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_live_connections",
+        "Number of live network connections",
        &["pageserver_connection_kind"]
    )
    .expect("failed to define a metric")
@@ -1472,7 +1450,10 @@ pub(crate) enum ComputeCommandKind {
    PageStreamV2,
    PageStream,
    Basebackup,
+    GetLastRecordRlsn,
    Fullbackup,
+    ImportBasebackup,
+    ImportWal,
    LeaseLsn,
    Show,
 }
@@ -1713,15 +1694,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
 }
 });

-pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_secondary_resident_physical_size",
-        "The size of the layer files present in the pageserver's filesystem, for secondary locations.",
-        &["tenant_id", "shard_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -2124,8 +2096,6 @@ pub(crate) struct TimelineMetrics {
    pub garbage_collect_histo: StorageTimeMetrics,
    pub find_gc_cutoffs_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    pub pitr_history_size: UIntGauge,
-    pub archival_size: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
@@ -2199,15 +2169,6 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-
-        let pitr_history_size = PITR_HISTORY_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
-
-        let archival_size = TIMELINE_ARCHIVE_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
-
        let standby_horizon_gauge = STANDBY_HORIZON
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2260,8 +2221,6 @@ impl TimelineMetrics {
            find_gc_cutoffs_histo,
            load_layer_map_histo,
            last_record_gauge,
-            pitr_history_size,
-            archival_size,
            standby_horizon_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
@@ -2319,10 +2278,6 @@ impl TimelineMetrics {
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
-
-        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-
        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2356,12 +2311,14 @@ impl TimelineMetrics {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
        }

-        let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
-            SmgrQueryType::GetPageAtLsn.into(),
-            tenant_id,
-            shard_id,
-            timeline_id,
-        ]);
+        for op in SmgrQueryType::iter() {
+            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
+                op.into(),
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ]);
+        }
    }
 }

--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -4,7 +4,9 @@
 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
+use bytes::Bytes;
 use futures::stream::FuturesUnordered;
+use futures::Stream;
 use futures::StreamExt;
 use pageserver_api::key::Key;
 use pageserver_api::models::TenantState;
@@ -26,6 +28,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io;
 use std::net::TcpListener;
+use std::pin::pin;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -34,6 +37,7 @@ use std::time::Instant;
 use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -49,8 +53,9 @@ use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
-use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
+use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
@@ -61,6 +66,7 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
+use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
@@ -76,6 +82,56 @@ use postgres_ffi::BLCKSZ;
 // is not yet in state [`TenantState::Active`].
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

+/// Read the end of a tar archive.
+///
+/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
+/// `tokio_tar` already read the first such block. Read the second all-zeros block,
+/// and check that there is no more data after the EOF marker. The 'tar' command can
+/// also write extra blocks of zeros, up to a record size, controlled by the
+/// --record-size argument; ignore them too. Fails if the marker is short or non-zero,
+/// or if the trailing data is non-zero or not a multiple of 512 bytes.
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    let mut buf = [0u8; 512];
+
+    // Read the all-zeros block, and verify it
+    let mut total_bytes = 0;
+    while total_bytes < 512 {
+        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
+        total_bytes += nbytes;
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if total_bytes < 512 {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if !buf.iter().all(|&x| x == 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Drain any extra zero-blocks after the EOF marker; inspect only the bytes each read returned
+    let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
+    loop {
+        let nbytes = reader.read(&mut buf).await?;
+        trailing_bytes += nbytes;
+        if !buf[..nbytes].iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    }
+    Ok(())
+}
+
 ///////////////////////////////////////////////////////////////////////////////

 ///
@@ -85,6 +141,7 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 ///
 pub async fn libpq_listener_main(
    tenant_manager: Arc<TenantManager>,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
@@ -129,6 +186,7 @@ pub async fn libpq_listener_main(
                    false,
                    page_service_conn_main(
                        tenant_manager.clone(),
+                        broker_client.clone(),
                        local_auth,
                        socket,
                        auth_type,
@@ -151,14 +209,20 @@ pub async fn libpq_listener_main(
 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
    tenant_manager: Arc<TenantManager>,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    let _guard = LIVE_CONNECTIONS
-        .with_label_values(&["page_service"])
-        .guard();
+    // Immediately increment the gauge, then create a job to decrement it on task exit.
+    // One of the pros of `defer!` is that this will *most probably*
+    // get called, even in presence of panics.
+    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
+    gauge.inc();
+    scopeguard::defer! {
+        gauge.dec();
+    }

    socket
        .set_nodelay(true)
@@ -203,11 +267,12 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
+    let mut conn_handler =
+        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
-        .run(&mut conn_handler, &task_mgr::shutdown_token())
+        .run(&mut conn_handler, task_mgr::shutdown_watcher)
        .await
    {
        Ok(()) => {
@@ -234,6 +299,7 @@ struct HandlerTimeline {
 }

 struct PageServerHandler {
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -325,11 +391,13 @@ impl From<WaitLsnError> for QueryError {
 impl PageServerHandler {
    pub fn new(
        tenant_manager: Arc<TenantManager>,
+        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
            tenant_manager,
+            broker_client,
            auth,
            claims: None,
            connection_ctx,
@@ -412,6 +480,73 @@ impl PageServerHandler {
        )
    }

+    /// Stream the payload of the client's CopyData messages as `Bytes` until CopyDone; Sync messages are skipped, and shutdown, disconnect, Terminate or any other message ends the stream with an `io::Error`.
+    fn copyin_stream<'a, IO>(
+        &'a self,
+        pgb: &'a mut PostgresBackend<IO>,
+        cancel: &'a CancellationToken,
+    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        async_stream::try_stream! {
+            loop {
+                let msg = tokio::select! {
+                    biased;
+
+                    _ = cancel.cancelled() => { // We were requested to shut down.
+                        let msg = "pageserver is shutting down";
+                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                        Err(QueryError::Shutdown)
+                    }
+
+                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+                };
+
+                match msg {
+                    Ok(Some(message)) => {
+                        let copy_data_bytes = match message {
+                            FeMessage::CopyData(bytes) => bytes,
+                            FeMessage::CopyDone => { break },
+                            FeMessage::Sync => continue,
+                            FeMessage::Terminate => {
+                                let msg = "client terminated connection with Terminate message during COPY";
+                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                                break;
+                            }
+                            m => {
+                                let msg = format!("unexpected message {m:?}");
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                                break;
+                            }
+                        };
+
+                        yield copy_data_bytes;
+                    }
+                    Ok(None) => {
+                        let msg = "client closed connection during COPY";
+                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                        // error can't happen here, ErrorResponse serialization should be always ok
+                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                    }
+                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                        Err(io_error)?;
+                    }
+                    Err(other) => {
+                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                    }
+                };
+            }
+        }
+    }
+
    #[instrument(skip_all)]
    async fn handle_pagerequests<IO>(
        &mut self,
@@ -583,6 +718,128 @@ impl PageServerHandler {
        Ok(())
    }

+    /// Create an empty timeline at `base_lsn` and fill it from a basebackup tar archive that the client streams via CopyData.
+    /// `_end_lsn` is only recorded in the tracing span; nothing else in this function reads it.
+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
+    async fn handle_import_basebackup<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        base_lsn: Lsn,
+        _end_lsn: Lsn,
+        pg_version: u32,
+        ctx: RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+
+        // Create empty timeline
+        info!("creating new timeline");
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
+            .await?;
+        let timeline = tenant
+            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
+            .await?;
+
+        // TODO mark timeline as not ready until it reaches end_lsn.
+        // We might have some wal to import as well, and we should prevent compute
+        // from connecting before that and writing conflicting wal.
+        //
+        // This is not relevant for pageserver->pageserver migrations, since there's
+        // no wal to import. But should be fixed if we want to import from postgres.
+
+        // TODO leave clean state on error. For now you can use detach to clean up broken state from a failed import.
+
+        // Import basebackup provided via CopyData
+        info!("importing basebackup");
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
+        self.flush_cancellable(pgb, &tenant.cancel).await?;
+
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
+        timeline
+            .import_basebackup_from_tar(
+                tenant.clone(),
+                &mut copyin_reader,
+                base_lsn,
+                self.broker_client.clone(),
+                &ctx,
+            )
+            .await?;
+
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;
+
+        // TODO check checksum
+        // Meanwhile you can verify client-side by taking fullbackup and checking that it
+        // matches in size with what was imported. It wouldn't work if base came from
+        // vanilla postgres though, since we discard some log files.
+
+        info!("done");
+        Ok(())
+    }
+
+    /// Import WAL from a tar archive streamed via CopyData into an existing timeline, then freeze and flush it.
+    /// The timeline must end exactly at `start_lsn` beforehand and must have reached `end_lsn` afterwards.
+    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
+    async fn handle_import_wal<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        start_lsn: Lsn,
+        end_lsn: Lsn,
+        ctx: RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .await?;
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
+
+        // TODO leave clean state on error. For now you can use detach to clean up broken state from a failed import.
+
+        // Import wal provided via CopyData
+        info!("importing wal");
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
+        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
+        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
+        info!("wal import complete");
+
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;
+
+        // TODO Does it make sense to overshoot?
+        if timeline.get_last_record_lsn() < end_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL up to Lsn {end_lsn} because the imported WAL only reached Lsn {}", timeline.get_last_record_lsn()))
+            );
+        }
+
+        // Flush data to disk, then upload to s3. No need for a forced checkpoint: we only want
+        // to persist the data, and it doesn't matter if it's in the shape of deltas or images.
+        info!("flushing layers");
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            FlushLayerError::Cancelled => QueryError::Shutdown,
+            other => QueryError::Other(other.into()),
+        })?;
+
+        info!("done");
+        Ok(())
+    }
+
    /// Helper function to handle the LSN from client request.
    ///
    /// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1399,6 +1656,53 @@ where
            metric_recording.observe(&res);
            res?;
        }
+        // return pair of prev_lsn and last_lsn
+        else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for get_last_record_rlsn command"
+                )));
+            }
+
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::GetLastRecordRlsn)
+                .inc();
+
+            async {
+                let timeline = self
+                    .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                    .await?;
+
+                let end_of_timeline = timeline.get_last_record_rlsn();
+
+                pgb.write_message_noflush(&BeMessage::RowDescription(&[
+                    RowDescriptor::text_col(b"prev_lsn"),
+                    RowDescriptor::text_col(b"last_lsn"),
+                ]))?
+                .write_message_noflush(&BeMessage::DataRow(&[
+                    Some(end_of_timeline.prev.to_string().as_bytes()),
+                    Some(end_of_timeline.last.to_string().as_bytes()),
+                ]))?
+                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                anyhow::Ok(())
+            }
+            .instrument(info_span!(
+                "handle_get_last_record_lsn",
+                shard_id = tracing::field::Empty
+            ))
+            .await?;
+        }
        // same as basebackup, but result includes relational data as well
        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
            if params.len() < 2 {
@@ -1453,6 +1757,109 @@ where
            )
            .await?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("import basebackup ") {
+            // Import the `base` section (everything but the wal) of a basebackup.
+            // Assumes the tenant already exists on this pageserver.
+            //
+            // Files are scheduled to be persisted to remote storage, and the
+            // caller should poll the http api to check when that is done.
+            //
+            // Example import command:
+            // 1. Get start/end LSN from backup_manifest file
+            // 2. Run:
+            // cat my_backup/base.tar | psql -h $PAGESERVER \
+            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
+            let params = &parts[2..];
+            if params.len() != 5 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import basebackup command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let base_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+            let pg_version = u32::from_str(params[4])
+                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::ImportBasebackup)
+                .inc();
+
+            match self
+                .handle_import_basebackup(
+                    pgb,
+                    tenant_id,
+                    timeline_id,
+                    base_lsn,
+                    end_lsn,
+                    pg_version,
+                    ctx,
+                )
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
+        } else if query_string.starts_with("import wal ") {
+            // Import the `pg_wal` section of a basebackup.
+            //
+            // Files are scheduled to be persisted to remote storage, and the
+            // caller should poll the http api to check when that is done.
+            let params = &parts[2..];
+            if params.len() != 4 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import wal command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let start_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::ImportWal)
+                .inc();
+
+            match self
+                .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -854,14 +854,13 @@ impl Timeline {
        result.add_key(DBDIR_KEY);

        // Fetch list of database dirs and iterate them
-        let dbdir = self.list_dbdirs(lsn, ctx).await?;
-        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
+        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
+        let dbdir = DbDirectory::des(&buf)?;

-        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
-        for ((spcnode, dbnode), has_relmap_file) in dbs {
-            if has_relmap_file {
-                result.add_key(relmap_file_key(spcnode, dbnode));
-            }
+        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
+        dbs.sort_unstable();
+        for (spcnode, dbnode) in dbs {
+            result.add_key(relmap_file_key(spcnode, dbnode));
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
@@ -920,9 +919,6 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

-        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
-        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
-        // and the keys will not be garbage-colllected.
        #[cfg(test)]
        {
            let guard = self.extra_test_dense_keyspace.load();
@@ -931,48 +927,13 @@ impl Timeline {
            }
        }

-        let dense_keyspace = result.to_keyspace();
-        let sparse_keyspace = SparseKeySpace(KeySpace {
-            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
-        });
-
-        if cfg!(debug_assertions) {
-            // Verify if the sparse keyspaces are ordered and non-overlapping.
-
-            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
-            // category of sparse keys are split into their own image/delta files. If there
-            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
-            // and we want the developer to keep the keyspaces separated.
-
-            let ranges = &sparse_keyspace.0.ranges;
-
-            // TODO: use a single overlaps_with across the codebase
-            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-                !(a.end <= b.start || b.end <= a.start)
-            }
-            for i in 0..ranges.len() {
-                for j in 0..i {
-                    if overlaps_with(&ranges[i], &ranges[j]) {
-                        panic!(
-                            "overlapping sparse keyspace: {}..{} and {}..{}",
-                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
-                        );
-                    }
-                }
-            }
-            for i in 1..ranges.len() {
-                assert!(
-                    ranges[i - 1].end <= ranges[i].start,
-                    "unordered sparse keyspace: {}..{} and {}..{}",
-                    ranges[i - 1].start,
-                    ranges[i - 1].end,
-                    ranges[i].start,
-                    ranges[i].end
-                );
-            }
-        }
-
-        Ok((dense_keyspace, sparse_keyspace))
+        Ok((
+            result.to_keyspace(),
+            /* AUX sparse key space */
+            SparseKeySpace(KeySpace {
+                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
+            }),
+        ))
    }

    /// Get cached size of relation if it not updated after specified LSN
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -73,7 +73,6 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
-use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::TENANT;
 use crate::metrics::{
    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
@@ -167,7 +166,6 @@ pub struct TenantSharedResources {
    pub broker_client: storage_broker::BrokerClientChannel,
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
-    pub l0_flush_global_state: L0FlushGlobalState,
 }

 /// A [`Tenant`] is really an _attached_ tenant.  The configuration
@@ -215,6 +213,8 @@ pub(crate) enum SpawnMode {
    Eager,
    /// Lazy activation in the background, with the option to skip the queue if the need comes up
    Lazy,
+    /// Tenant has been created during the lifetime of this process
+    Create,
 }

 ///
@@ -296,8 +296,6 @@ pub struct Tenant {

    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
-
-    l0_flush_global_state: L0FlushGlobalState,
 }

 impl std::fmt::Debug for Tenant {
@@ -533,15 +531,6 @@ impl From<PageReconstructError> for GcError {
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum LoadConfigError {
-    #[error("TOML deserialization error: '{0}'")]
-    DeserializeToml(#[from] toml_edit::de::Error),
-
-    #[error("Config not found at {0}")]
-    NotFound(Utf8PathBuf),
-}
-
 impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
@@ -680,7 +669,6 @@ impl Tenant {
            broker_client,
            remote_storage,
            deletion_queue_client,
-            l0_flush_global_state,
        } = resources;

        let attach_mode = attached_conf.location.attach_mode;
@@ -695,7 +683,6 @@ impl Tenant {
            tenant_shard_id,
            remote_storage.clone(),
            deletion_queue_client,
-            l0_flush_global_state,
        ));

        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -821,6 +808,9 @@ impl Tenant {
                };

                let preload = match &mode {
+                    SpawnMode::Create => {
+                        None
+                    },
                    SpawnMode::Eager | SpawnMode::Lazy => {
                        let _preload_timer = TENANT.preload.start_timer();
                        let res = tenant_clone
@@ -842,8 +832,11 @@ impl Tenant {

                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
                let attached = {
-                    let _attach_timer = Some(TENANT.attach.start_timer());
-                    tenant_clone.attach(preload, &ctx).await
+                    let _attach_timer = match mode {
+                        SpawnMode::Create => None,
+                        SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
+                    };
+                    tenant_clone.attach(preload, mode, &ctx).await
                };

                match attached {
@@ -919,14 +912,21 @@ impl Tenant {
    async fn attach(
        self: &Arc<Tenant>,
        preload: Option<TenantPreload>,
+        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        failpoint_support::sleep_millis_async!("before-attaching-tenant");

-        let Some(preload) = preload else {
-            anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
+        let preload = match (preload, mode) {
+            (Some(p), _) => p,
+            (None, SpawnMode::Create) => TenantPreload {
+                timelines: HashMap::new(),
+            },
+            (None, _) => {
+                anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
+            }
        };

        let mut timelines_to_resume_deletions = vec![];
@@ -995,7 +995,6 @@ impl Tenant {
                TimelineResources {
                    remote_client,
                    timeline_get_throttle: self.timeline_get_throttle.clone(),
-                    l0_flush_global_state: self.l0_flush_global_state.clone(),
                },
                ctx,
            )
@@ -1365,7 +1364,7 @@ impl Tenant {
        initdb_lsn: Lsn,
        pg_version: u32,
        ctx: &RequestContext,
-        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
+        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -1816,15 +1815,9 @@ impl Tenant {
        // If we're still attaching, fire the cancellation token early to drop out: this
        // will prevent us flushing, but ensures timely shutdown if some I/O during attach
        // is very slow.
-        let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
+        if matches!(self.current_state(), TenantState::Attaching) {
            self.cancel.cancel();
-
-            // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
-            // are children of ours, so their flush loops will have shut down already
-            timeline::ShutdownMode::Hard
-        } else {
-            shutdown_mode
-        };
+        }

        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
@@ -2491,7 +2484,6 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
-        l0_flush_global_state: L0FlushGlobalState,
    ) -> Tenant {
        debug_assert!(
            !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
@@ -2579,7 +2571,6 @@ impl Tenant {
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
-            l0_flush_global_state,
        }
    }

@@ -2587,35 +2578,36 @@ impl Tenant {
    pub(super) fn load_tenant_config(
        conf: &'static PageServerConf,
        tenant_shard_id: &TenantShardId,
-    ) -> Result<LocationConf, LoadConfigError> {
+    ) -> anyhow::Result<LocationConf> {
        let config_path = conf.tenant_location_config_path(tenant_shard_id);

-        info!("loading tenant configuration from {config_path}");
+        if config_path.exists() {
+            // New-style config takes precedence
+            let deserialized = Self::read_config(&config_path)?;
+            Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
+        } else {
+            // The config should almost always exist for a tenant directory:
+            //  - When attaching a tenant, the config is the first thing we write
+            //  - When detaching a tenant, we atomically move the directory to a tmp location
+            //    before deleting contents.
+            //
+            // The very rare edge case that can result in a missing config is if we crash during attach
+            // between creating directory and writing config.  Callers should handle that as if the
+            // directory didn't exist.
+            anyhow::bail!("tenant config not found in {}", config_path);
+        }
+    }
+
+    fn read_config(path: &Utf8Path) -> anyhow::Result<toml_edit::Document> {
+        info!("loading tenant configuration from {path}");

        // load and parse file
-        let config = fs::read_to_string(&config_path).map_err(|e| {
-            match e.kind() {
-                std::io::ErrorKind::NotFound => {
-                    // The config should almost always exist for a tenant directory:
-                    //  - When attaching a tenant, the config is the first thing we write
-                    //  - When detaching a tenant, we atomically move the directory to a tmp location
-                    //    before deleting contents.
-                    //
-                    // The very rare edge case that can result in a missing config is if we crash during attach
-                    // between creating directory and writing config.  Callers should handle that as if the
-                    // directory didn't exist.
+        let config = fs::read_to_string(path)
+            .with_context(|| format!("Failed to load config from path '{path}'"))?;

-                    LoadConfigError::NotFound(config_path)
-                }
-                _ => {
-                    // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
-                    // that we cannot cleanly recover
-                    crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
-                }
-            }
-        })?;
-
-        Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
+        config
+            .parse::<toml_edit::Document>()
+            .with_context(|| format!("Failed to parse config from file '{path}' as toml file"))
    }

    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
@@ -2623,7 +2615,7 @@ impl Tenant {
        conf: &'static PageServerConf,
        tenant_shard_id: &TenantShardId,
        location_conf: &LocationConf,
-    ) -> std::io::Result<()> {
+    ) -> anyhow::Result<()> {
        let config_path = conf.tenant_location_config_path(tenant_shard_id);

        Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
@@ -2634,7 +2626,7 @@ impl Tenant {
        tenant_shard_id: &TenantShardId,
        config_path: &Utf8Path,
        location_conf: &LocationConf,
-    ) -> std::io::Result<()> {
+    ) -> anyhow::Result<()> {
        debug!("persisting tenantconf to {config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -2643,20 +2635,22 @@ impl Tenant {
        .to_string();

        fail::fail_point!("tenant-config-before-write", |_| {
-            Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "tenant-config-before-write",
-            ))
+            anyhow::bail!("tenant-config-before-write");
        });

        // Convert the config to a toml file.
-        conf_content +=
-            &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");
+        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;

        let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);

+        let tenant_shard_id = *tenant_shard_id;
+        let config_path = config_path.to_owned();
        let conf_content = conf_content.into_bytes();
-        VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
+        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
+
+        Ok(())
    }

    //
@@ -2874,7 +2868,6 @@ impl Tenant {
            {
                let mut target = timeline.gc_info.write().unwrap();

-                // Cull any expired leases
                let now = SystemTime::now();
                target.leases.retain(|_, lease| !lease.is_expired(&now));

@@ -2883,31 +2876,6 @@ impl Tenant {
                    .valid_lsn_lease_count_gauge
                    .set(target.leases.len() as u64);

-                // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
-                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
-                    if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
-                        target.within_ancestor_pitr =
-                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
-                    }
-                }
-
-                // Update metrics that depend on GC state
-                timeline
-                    .metrics
-                    .archival_size
-                    .set(if target.within_ancestor_pitr {
-                        timeline.metrics.current_logical_size_gauge.get()
-                    } else {
-                        0
-                    });
-                timeline.metrics.pitr_history_size.set(
-                    timeline
-                        .get_last_record_lsn()
-                        .checked_sub(target.cutoffs.pitr)
-                        .unwrap_or(Lsn(0))
-                        .0,
-                );
-
                match gc_cutoffs.remove(&timeline.timeline_id) {
                    Some(cutoffs) => {
                        target.retain_lsns = branchpoints;
@@ -2959,7 +2927,7 @@ impl Tenant {
        dst_id: TimelineId,
        ancestor_lsn: Option<Lsn>,
        ctx: &RequestContext,
-        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
+        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -3343,7 +3311,6 @@ impl Tenant {
        TimelineResources {
            remote_client,
            timeline_get_throttle: self.timeline_get_throttle.clone(),
-            l0_flush_global_state: self.l0_flush_global_state.clone(),
        }
    }

@@ -3680,7 +3647,6 @@ pub(crate) mod harness {
    use utils::logging;

    use crate::deletion_queue::mock::MockDeletionQueue;
-    use crate::l0_flush::L0FlushConfig;
    use crate::walredo::apply_neon;
    use crate::{repository::Key, walrecord::NeonWalRecord};

@@ -3870,14 +3836,12 @@ pub(crate) mod harness {
                self.tenant_shard_id,
                self.remote_storage.clone(),
                self.deletion_queue.new_client(),
-                // TODO: ideally we should run all unit tests with both configs
-                L0FlushGlobalState::new(L0FlushConfig::default()),
            ));

            let preload = tenant
                .preload(&self.remote_storage, CancellationToken::new())
                .await?;
-            tenant.attach(Some(preload), ctx).await?;
+            tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;

            tenant.state.send_replace(TenantState::Active);
            for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3959,7 +3923,7 @@ mod tests {
    use storage_layer::PersistentLayerKey;
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
-    use timeline::{DeltaLayerTestDesc, GcInfo};
+    use timeline::GcInfo;
    use utils::bin_ser::BeSer;
    use utils::id::TenantId;

@@ -6255,6 +6219,27 @@ mod tests {
            .await
            .unwrap();

+        async fn get_vectored_impl_wrapper( // Test helper: vectored read of the single `key` at `lsn` via `get_vectored_impl`.
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new(); // fresh reconstruct state for every call
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()), // keyspace holding only the range key..key.next()
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?; // a failed read is returned to the caller as GetVectoredError
+            Ok(res.pop_last().map(|(k, v)| { // Ok(None) when the read produced no entry at all
+                assert_eq!(k, key); // the entry taken from the result must be for the requested key
+                v.unwrap() // panics if the per-key value is not a success value
+            }))
+        }
+
        let lsn = Lsn(0x30);

        // test vectored get on parent timeline
@@ -6294,7 +6279,7 @@ mod tests {

    #[tokio::test]
    async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
+        let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
        let (tenant, ctx) = harness.load().await;

        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6330,6 +6315,27 @@ mod tests {
            .await
            .unwrap();

+        async fn get_vectored_impl_wrapper( // Test helper: vectored read of the single `key` at `lsn` via `get_vectored_impl`.
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new(); // fresh reconstruct state for every call
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()), // keyspace holding only the range key..key.next()
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?; // a failed read is returned to the caller as GetVectoredError
+            Ok(res.pop_last().map(|(k, v)| { // Ok(None) when the read produced no entry at all
+                assert_eq!(k, key); // the entry taken from the result must be for the requested key
+                v.unwrap() // panics if the per-key value is not a success value
+            }))
+        }
+
        let lsn = Lsn(0x30);

        // test vectored get on parent timeline
@@ -6405,18 +6411,9 @@ mod tests {
                &ctx,
                // delta layers
                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x10)..Lsn(0x20),
-                        vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-                    ),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x20)..Lsn(0x30),
-                        vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-                    ),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x20)..Lsn(0x30),
-                        vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-                    ),
+                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
                ],
                // image layers
                vec![
@@ -6482,29 +6479,17 @@ mod tests {
                &ctx,
                // delta layers
                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x10)..Lsn(0x20),
-                        vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-                    ),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x20)..Lsn(0x30),
-                        vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-                    ),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x20)..Lsn(0x30),
-                        vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-                    ),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x30)..Lsn(0x40),
-                        vec![
-                            (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
-                            (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
-                        ],
-                    ),
+                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![
+                        (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
+                        (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
+                    ],
                ],
                // image layers
                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
-                Lsn(0x40),
+                Lsn(0x30),
            )
            .await
            .unwrap();
@@ -6527,7 +6512,7 @@ mod tests {

        // Image layers are created at last_record_lsn
        let images = tline
-            .inspect_image_layers(Lsn(0x40), &ctx)
+            .inspect_image_layers(Lsn(0x30), &ctx)
            .await
            .unwrap()
            .into_iter()
@@ -6553,18 +6538,9 @@ mod tests {
                &ctx,
                // delta layers
                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x10)..Lsn(0x20),
-                        vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
-                    ),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x20)..Lsn(0x30),
-                        vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
-                    ),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(
-                        Lsn(0x20)..Lsn(0x30),
-                        vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
-                    ),
+                    vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
+                    vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
+                    vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
                ],
                // image layers
                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
@@ -6612,21 +6588,15 @@ mod tests {
            key
        }

-        // We create
-        // - one bottom-most image layer,
-        // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
-        // - a delta layer D2 crossing the GC horizon with data only below the horizon,
-        // - a delta layer D3 above the horizon.
+        // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
        //
-        //                             | D3 |
-        //  | D1 |
+        //  | D1 |                       | D3 |
        // -|    |-- gc horizon -----------------
        //  |    |                | D2 |
        // --------- img layer ------------------
        //
        // What we should expect from this compaction is:
-        //                             | D3 |
-        //  | Part of D1 |
+        //  | Part of D1 |               | D3 |
        // --------- img layer with D1+D2 at GC horizon------------------

        // img layer at 0x10
@@ -6666,13 +6636,13 @@ mod tests {
        let delta3 = vec![
            (
                get_key(8),
-                Lsn(0x48),
-                Value::Image(Bytes::from("value 8@0x48")),
+                Lsn(0x40),
+                Value::Image(Bytes::from("value 8@0x40")),
            ),
            (
                get_key(9),
-                Lsn(0x48),
-                Value::Image(Bytes::from("value 9@0x48")),
+                Lsn(0x40),
+                Value::Image(Bytes::from("value 9@0x40")),
            ),
        ];

@@ -6682,11 +6652,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
+                vec![delta1, delta2, delta3], // delta layers
                vec![(Lsn(0x10), img_layer)], // image layers
                Lsn(0x50),
            )
@@ -6707,8 +6673,8 @@ mod tests {
            Bytes::from_static(b"value 5@0x20"),
            Bytes::from_static(b"value 6@0x20"),
            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x48"),
-            Bytes::from_static(b"value 9@0x48"),
+            Bytes::from_static(b"value 8@0x40"),
+            Bytes::from_static(b"value 9@0x40"),
        ];

        for (idx, expected) in expected_result.iter().enumerate() {
@@ -6796,10 +6762,10 @@ mod tests {
                    lsn_range: Lsn(0x30)..Lsn(0x41),
                    is_delta: true
                },
-                // The delta3 layer that should not be picked for the compaction
+                // The delta layer we created and should not be picked for the compaction
                PersistentLayerKey {
                    key_range: get_key(8)..get_key(10),
-                    lsn_range: Lsn(0x48)..Lsn(0x50),
+                    lsn_range: Lsn(0x40)..Lsn(0x41),
                    is_delta: true
                }
            ]
@@ -6863,10 +6829,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
-                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
-                    Lsn(0x10)..Lsn(0x40),
-                    delta1,
-                )], // delta layers
+                vec![delta1],              // delta layers
                vec![(Lsn(0x10), image1)], // image layers
                Lsn(0x50),
            )
@@ -6990,21 +6953,15 @@ mod tests {
            key
        }

-        // We create
-        // - one bottom-most image layer,
-        // - a delta layer D1 crossing the GC horizon with data below and above the horizon,
-        // - a delta layer D2 crossing the GC horizon with data only below the horizon,
-        // - a delta layer D3 above the horizon.
+        // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
        //
-        //                             | D3 |
-        //  | D1 |
+        //  | D1 |                       | D3 |
        // -|    |-- gc horizon -----------------
        //  |    |                | D2 |
        // --------- img layer ------------------
        //
        // What we should expect from this compaction is:
-        //                             | D3 |
-        //  | Part of D1 |
+        //  | Part of D1 |               | D3 |
        // --------- img layer with D1+D2 at GC horizon------------------

        // img layer at 0x10
@@ -7054,13 +7011,13 @@ mod tests {
        let delta3 = vec![
            (
                get_key(8),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
            ),
            (
                get_key(9),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
+                Lsn(0x40),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
            ),
        ];

@@ -7070,11 +7027,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
+                vec![delta1, delta2, delta3], // delta layers
                vec![(Lsn(0x10), img_layer)], // image layers
                Lsn(0x50),
            )
@@ -7089,7 +7042,6 @@ mod tests {
                    horizon: Lsn(0x30),
                },
                leases: Default::default(),
-                within_ancestor_pitr: false,
            };
        }

@@ -7102,8 +7054,8 @@ mod tests {
            Bytes::from_static(b"value 5@0x10@0x20"),
            Bytes::from_static(b"value 6@0x10@0x20"),
            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10@0x48"),
-            Bytes::from_static(b"value 9@0x10@0x48"),
+            Bytes::from_static(b"value 8@0x10@0x40"),
+            Bytes::from_static(b"value 9@0x10@0x40"),
        ];

        let expected_result_at_gc_horizon = [
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -6,20 +6,13 @@
 //! is written as one byte. If it's larger than that, the length
 //! is written as a four-byte integer, in big-endian, with the high
 //! bit set. This way, we can detect whether it's 1- or 4-byte header
-//! by peeking at the first byte. For blobs larger than 128 bits,
-//! we also specify three reserved bits, only one of the three bit
-//! patterns is currently in use (0b011) and signifies compression
-//! with zstd.
+//! by peeking at the first byte.
 //!
 //! len <  128: 0XXXXXXX
-//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
+//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
-use async_compression::Level;
 use bytes::{BufMut, BytesMut};
-use pageserver_api::models::ImageCompressionAlgorithm;
-use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
-use tracing::warn;

 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
@@ -73,37 +66,12 @@ impl<'a> BlockCursor<'a> {
                len_buf.copy_from_slice(&buf[off..off + 4]);
                off += 4;
            }
-            let bit_mask = if self.read_compressed {
-                !LEN_COMPRESSION_BIT_MASK
-            } else {
-                0x7f
-            };
-            len_buf[0] &= bit_mask;
+            len_buf[0] &= 0x7f;
            u32::from_be_bytes(len_buf) as usize
        };
-        let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;

-        let mut tmp_buf = Vec::new();
-        let buf_to_write;
-        let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed {
-            if compression_bits > BYTE_UNCOMPRESSED {
-                warn!("reading key above future limit ({len} bytes)");
-            }
-            buf_to_write = dstbuf;
-            None
-        } else if compression_bits == BYTE_ZSTD {
-            buf_to_write = &mut tmp_buf;
-            Some(dstbuf)
-        } else {
-            let error = std::io::Error::new(
-                std::io::ErrorKind::InvalidData,
-                format!("invalid compression byte {compression_bits:x}"),
-            );
-            return Err(error);
-        };
-
-        buf_to_write.clear();
-        buf_to_write.reserve(len);
+        dstbuf.clear();
+        dstbuf.reserve(len);

        // Read the payload
        let mut remain = len;
@@ -117,35 +85,14 @@ impl<'a> BlockCursor<'a> {
                page_remain = PAGE_SZ;
            }
            let this_blk_len = min(remain, page_remain);
-            buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]);
+            dstbuf.extend_from_slice(&buf[off..off + this_blk_len]);
            remain -= this_blk_len;
            off += this_blk_len;
        }
-
-        if let Some(dstbuf) = compression {
-            if compression_bits == BYTE_ZSTD {
-                let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf);
-                decoder.write_all(buf_to_write).await?;
-                decoder.flush().await?;
-            } else {
-                unreachable!("already checked above")
-            }
-        }
-
        Ok(())
    }
 }

-/// Reserved bits for length and compression
-const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
-
-/// The maximum size of blobs we support. The highest few bits
-/// are reserved for compression and other further uses.
-const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
-
-const BYTE_UNCOMPRESSED: u8 = 0x80;
-const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
-
 /// A wrapper of `VirtualFile` that allows users to write blobs.
 ///
 /// If a `BlobWriter` is dropped, the internal buffer will be
@@ -272,18 +219,6 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        &mut self,
        srcbuf: B,
        ctx: &RequestContext,
-    ) -> (B::Buf, Result<u64, Error>) {
-        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await
-    }
-
-    /// Write a blob of data. Returns the offset that it was written to,
-    /// which can be used to retrieve the data later.
-    pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
-        &mut self,
-        srcbuf: B,
-        ctx: &RequestContext,
-        algorithm: ImageCompressionAlgorithm,
    ) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;

@@ -291,60 +226,29 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {

        let mut io_buf = self.io_buf.take().expect("we always put it back below");
        io_buf.clear();
-        let mut compressed_buf = None;
-        let ((io_buf, hdr_res), srcbuf) = async {
+        let (io_buf, hdr_res) = async {
            if len < 128 {
                // Short blob. Write a 1-byte length header
                io_buf.put_u8(len as u8);
-                (
-                    self.write_all(io_buf, ctx).await,
-                    srcbuf.slice_full().into_inner(),
-                )
+                self.write_all(io_buf, ctx).await
            } else {
                // Write a 4-byte length header
-                if len > MAX_SUPPORTED_LEN {
+                if len > 0x7fff_ffff {
                    return (
-                        (
-                            io_buf,
-                            Err(Error::new(
-                                ErrorKind::Other,
-                                format!("blob too large ({len} bytes)"),
-                            )),
-                        ),
-                        srcbuf.slice_full().into_inner(),
+                        io_buf,
+                        Err(Error::new(
+                            ErrorKind::Other,
+                            format!("blob too large ({len} bytes)"),
+                        )),
                    );
                }
-                let (high_bit_mask, len_written, srcbuf) = match algorithm {
-                    ImageCompressionAlgorithm::Zstd { level } => {
-                        let mut encoder = if let Some(level) = level {
-                            async_compression::tokio::write::ZstdEncoder::with_quality(
-                                Vec::new(),
-                                Level::Precise(level.into()),
-                            )
-                        } else {
-                            async_compression::tokio::write::ZstdEncoder::new(Vec::new())
-                        };
-                        let slice = srcbuf.slice_full();
-                        encoder.write_all(&slice[..]).await.unwrap();
-                        encoder.shutdown().await.unwrap();
-                        let compressed = encoder.into_inner();
-                        if compressed.len() < len {
-                            let compressed_len = compressed.len();
-                            compressed_buf = Some(compressed);
-                            (BYTE_ZSTD, compressed_len, slice.into_inner())
-                        } else {
-                            (BYTE_UNCOMPRESSED, len, slice.into_inner())
-                        }
-                    }
-                    ImageCompressionAlgorithm::Disabled => {
-                        (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
-                    }
-                };
-                let mut len_buf = (len_written as u32).to_be_bytes();
-                assert_eq!(len_buf[0] & 0xf0, 0);
-                len_buf[0] |= high_bit_mask;
+                if len > 0x0fff_ffff {
+                    tracing::warn!("writing blob above future limit ({len} bytes)");
+                }
+                let mut len_buf = (len as u32).to_be_bytes();
+                len_buf[0] |= 0x80;
                io_buf.extend_from_slice(&len_buf[..]);
-                (self.write_all(io_buf, ctx).await, srcbuf)
+                self.write_all(io_buf, ctx).await
            }
        }
        .await;
@@ -353,12 +257,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
            Ok(_) => (),
            Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
        }
-        let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
-            let (_buf, res) = self.write_all(compressed_buf, ctx).await;
-            (Slice::into_inner(srcbuf.slice(..)), res)
-        } else {
-            self.write_all(srcbuf, ctx).await
-        };
+        let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
        (srcbuf, res.map(|_| offset))
    }
 }
@@ -396,13 +295,6 @@ mod tests {
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
-        round_trip_test_compressed::<BUFFERED>(blobs, false).await
-    }
-
-    async fn round_trip_test_compressed<const BUFFERED: bool>(
-        blobs: &[Vec<u8>],
-        compression: bool,
-    ) -> Result<(), Error> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
@@ -413,16 +305,7 @@ mod tests {
            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
-                let (_, res) = if compression {
-                    wtr.write_blob_maybe_compressed(
-                        blob.clone(),
-                        &ctx,
-                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
-                    )
-                    .await
-                } else {
-                    wtr.write_blob(blob.clone(), &ctx).await
-                };
+                let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
                let offs = res?;
                offsets.push(offs);
            }
@@ -436,7 +319,7 @@ mod tests {

        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
-        let rdr = BlockCursor::new_with_compression(rdr, compression);
+        let rdr = BlockCursor::new(rdr);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
            let blob_read = rdr.read_blob(*offset, &ctx).await?;
            assert_eq!(
@@ -470,8 +353,6 @@ mod tests {
        ];
        round_trip_test::<false>(blobs).await?;
        round_trip_test::<true>(blobs).await?;
-        round_trip_test_compressed::<false>(blobs, true).await?;
-        round_trip_test_compressed::<true>(blobs, true).await?;
        Ok(())
    }

@@ -480,15 +361,10 @@ mod tests {
        let blobs = &[
            b"test".to_vec(),
            random_array(10 * PAGE_SZ),
-            b"hello".to_vec(),
-            random_array(66 * PAGE_SZ),
-            vec![0xf3; 24 * PAGE_SZ],
            b"foobar".to_vec(),
        ];
        round_trip_test::<false>(blobs).await?;
        round_trip_test::<true>(blobs).await?;
-        round_trip_test_compressed::<false>(blobs, true).await?;
-        round_trip_test_compressed::<true>(blobs, true).await?;
        Ok(())
    }

--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -37,7 +37,6 @@ where
 pub enum BlockLease<'a> {
    PageReadGuard(PageReadGuard<'static>),
    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
-    Slice(&'a [u8; PAGE_SZ]),
    #[cfg(test)]
    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
    #[cfg(test)]
@@ -64,7 +63,6 @@ impl<'a> Deref for BlockLease<'a> {
        match self {
            BlockLease::PageReadGuard(v) => v.deref(),
            BlockLease::EphemeralFileMutableTail(v) => v,
-            BlockLease::Slice(v) => v,
            #[cfg(test)]
            BlockLease::Arc(v) => v.deref(),
            #[cfg(test)]
@@ -83,7 +81,6 @@ pub(crate) enum BlockReaderRef<'a> {
    FileBlockReader(&'a FileBlockReader<'a>),
    EphemeralFile(&'a EphemeralFile),
    Adapter(Adapter<&'a DeltaLayerInner>),
-    Slice(&'a [u8]),
    #[cfg(test)]
    TestDisk(&'a super::disk_btree::tests::TestDisk),
    #[cfg(test)]
@@ -102,7 +99,6 @@ impl<'a> BlockReaderRef<'a> {
            FileBlockReader(r) => r.read_blk(blknum, ctx).await,
            EphemeralFile(r) => r.read_blk(blknum, ctx).await,
            Adapter(r) => r.read_blk(blknum, ctx).await,
-            Slice(s) => Self::read_blk_slice(s, blknum),
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
            #[cfg(test)]
@@ -111,24 +107,6 @@ impl<'a> BlockReaderRef<'a> {
    }
 }

-impl<'a> BlockReaderRef<'a> {
-    fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
-        let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
-        let end = start.checked_add(PAGE_SZ).unwrap();
-        if end > slice.len() {
-            return Err(std::io::Error::new(
-                std::io::ErrorKind::UnexpectedEof,
-                format!("slice too short, len={} end={}", slice.len(), end),
-            ));
-        }
-        let slice = &slice[start..end];
-        let page_sized: &[u8; PAGE_SZ] = slice
-            .try_into()
-            .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
-        Ok(BlockLease::Slice(page_sized))
-    }
-}
-
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -149,24 +127,16 @@ impl<'a> BlockReaderRef<'a> {
 /// ```
 ///
 pub struct BlockCursor<'a> {
-    pub(super) read_compressed: bool,
    reader: BlockReaderRef<'a>,
 }

 impl<'a> BlockCursor<'a> {
    pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
-        Self::new_with_compression(reader, false)
-    }
-    pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self {
-        BlockCursor {
-            read_compressed,
-            reader,
-        }
+        BlockCursor { reader }
    }
    // Needed by cli
    pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
        BlockCursor {
-            read_compressed: false,
            reader: BlockReaderRef::FileBlockReader(reader),
        }
    }
@@ -196,17 +166,11 @@ pub struct FileBlockReader<'a> {

    /// Unique ID of this file, used as key in the page cache.
    file_id: page_cache::FileId,
-
-    compressed_reads: bool,
 }

 impl<'a> FileBlockReader<'a> {
    pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
-        FileBlockReader {
-            file_id,
-            file,
-            compressed_reads: true,
-        }
+        FileBlockReader { file_id, file }
    }

    /// Read a page from the underlying file into given buffer.
@@ -253,10 +217,7 @@ impl<'a> FileBlockReader<'a> {

 impl BlockReader for FileBlockReader<'_> {
    fn block_cursor(&self) -> BlockCursor<'_> {
-        BlockCursor::new_with_compression(
-            BlockReaderRef::FileBlockReader(self),
-            self.compressed_reads,
-        )
+        BlockCursor::new(BlockReaderRef::FileBlockReader(self))
    }
 }

--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -21,7 +21,6 @@ pub struct EphemeralFile {
 }

 mod page_caching;
-pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
 mod zero_padded_read_write;

 impl EphemeralFile {
@@ -54,7 +53,7 @@ impl EphemeralFile {
        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
+            rw: page_caching::RW::new(file),
        })
    }

@@ -66,11 +65,6 @@ impl EphemeralFile {
        self.rw.page_cache_file_id()
    }

-    /// See [`self::page_caching::RW::load_to_vec`].
-    pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
-        self.rw.load_to_vec(ctx).await
-    }
-
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -8,7 +8,6 @@ use crate::virtual_file::VirtualFile;

 use once_cell::sync::Lazy;
 use std::io::{self, ErrorKind};
-use std::ops::{Deref, Range};
 use tokio_epoll_uring::BoundedBuf;
 use tracing::*;

@@ -20,23 +19,14 @@ pub struct RW {
    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
 }

-/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
-/// should we pre-warm the [`crate::page_cache`] with the contents?
-#[derive(Clone, Copy)]
-pub enum PrewarmOnWrite {
-    Yes,
-    No,
-}
-
 impl RW {
-    pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
+    pub fn new(file: VirtualFile) -> Self {
        let page_cache_file_id = page_cache::next_file_id();
        Self {
            page_cache_file_id,
            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
                page_cache_file_id,
                file,
-                prewarm_on_write,
            )),
        }
    }
@@ -59,43 +49,6 @@ impl RW {
        self.rw.bytes_written()
    }

-    /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
-    ///
-    /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
-    /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
-    pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
-        // round up to the next PAGE_SZ multiple, required by blob_io
-        let size = {
-            let s = usize::try_from(self.bytes_written()).unwrap();
-            if s % PAGE_SZ == 0 {
-                s
-            } else {
-                s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
-            }
-        };
-        let vec = Vec::with_capacity(size);
-
-        // read from disk what we've already flushed
-        let writer = self.rw.as_writer();
-        let flushed_range = writer.written_range();
-        let mut vec = writer
-            .file
-            .read_exact_at(
-                vec.slice(0..(flushed_range.end - flushed_range.start)),
-                u64::try_from(flushed_range.start).unwrap(),
-                ctx,
-            )
-            .await?
-            .into_inner();
-
-        // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
-        let buffered = self.rw.get_tail_zero_padded();
-        vec.extend_from_slice(buffered);
-        assert_eq!(vec.len(), size);
-        assert_eq!(vec.len() % PAGE_SZ, 0);
-        Ok(vec)
-    }
-
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
@@ -163,40 +116,19 @@ impl Drop for RW {
 }

 struct PreWarmingWriter {
-    prewarm_on_write: PrewarmOnWrite,
    nwritten_blocks: u32,
    page_cache_file_id: page_cache::FileId,
    file: VirtualFile,
 }

 impl PreWarmingWriter {
-    fn new(
-        page_cache_file_id: page_cache::FileId,
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-    ) -> Self {
+    fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
        Self {
-            prewarm_on_write,
            nwritten_blocks: 0,
            page_cache_file_id,
            file,
        }
    }
-
-    /// Return the byte range within `file` that has been written though `write_all`.
-    ///
-    /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
-    fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
-        let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
-        struct Wrapper(Range<usize>);
-        impl Deref for Wrapper {
-            type Target = Range<usize>;
-            fn deref(&self) -> &Range<usize> {
-                &self.0
-            }
-        }
-        Wrapper(0..nwritten_blocks * PAGE_SZ)
-    }
 }

 impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
@@ -246,51 +178,45 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
            assert_eq!(&check_bounds_stuff_works, &*buf);
        }

+        // Pre-warm page cache with the contents.
+        // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
+        // benefits the code that writes InMemoryLayer=>L0 layers.
        let nblocks = buflen / PAGE_SZ;
        let nblocks32 = u32::try_from(nblocks).unwrap();
-
-        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
-            // Pre-warm page cache with the contents.
-            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-            // benefits the code that writes InMemoryLayer=>L0 layers.
-
-            let cache = page_cache::get();
-            static CTX: Lazy<RequestContext> = Lazy::new(|| {
-                RequestContext::new(
-                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-                    crate::context::DownloadBehavior::Error,
-                )
-            });
-            for blknum_in_buffer in 0..nblocks {
-                let blk_in_buffer =
-                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-                let blknum = self
-                    .nwritten_blocks
-                    .checked_add(blknum_in_buffer as u32)
-                    .unwrap();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-                    .await
-                {
-                    Err(e) => {
-                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                    }
-                    Ok(v) => match v {
-                        page_cache::ReadBufResult::Found(_guard) => {
-                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
-                                      and this function takes &mut self, so, no concurrent read_blk is possible");
-                        }
-                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                            write_guard.copy_from_slice(blk_in_buffer);
-                            let _ = write_guard.mark_valid();
-                        }
-                    },
+        let cache = page_cache::get();
+        static CTX: Lazy<RequestContext> = Lazy::new(|| {
+            RequestContext::new(
+                crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
+                crate::context::DownloadBehavior::Error,
+            )
+        });
+        for blknum_in_buffer in 0..nblocks {
+            let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
+            let blknum = self
+                .nwritten_blocks
+                .checked_add(blknum_in_buffer as u32)
+                .unwrap();
+            match cache
+                .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
+                .await
+            {
+                Err(e) => {
+                    error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+                    // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
                }
+                Ok(v) => match v {
+                    page_cache::ReadBufResult::Found(_guard) => {
+                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
+                        unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
+                                      and this function takes &mut self, so, no concurrent read_blk is possible");
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        write_guard.copy_from_slice(blk_in_buffer);
+                        let _ = write_guard.mark_valid();
+                    }
+                },
            }
        }
-
        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
        Ok((buflen, buf.into_inner()))
    }
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
@@ -75,21 +75,6 @@ where
        flushed_offset + u64::try_from(buffer.pending()).unwrap()
    }

-    /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
-    pub fn get_tail_zero_padded(&self) -> &[u8] {
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        let buffer_written_up_to = buffer.pending();
-        // pad to next page boundary
-        let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
-            buffer_written_up_to
-        } else {
-            buffer_written_up_to
-                .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
-                .unwrap()
-        };
-        &buffer.as_zero_padded_slice()[0..read_up_to]
-    }
-
    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -43,8 +43,7 @@ use crate::tenant::config::{
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
-use crate::virtual_file::MaybeFatalIo;
+use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -273,7 +272,7 @@ pub struct TenantManager {
 }

 fn emergency_generations(
-    tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
+    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
 ) -> HashMap<TenantShardId, TenantStartupMode> {
    tenant_confs
        .iter()
@@ -297,7 +296,7 @@ fn emergency_generations(

 async fn init_load_generations(
    conf: &'static PageServerConf,
-    tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
+    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
    resources: &TenantSharedResources,
    cancel: &CancellationToken,
 ) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
@@ -347,32 +346,56 @@ async fn init_load_generations(
 /// Given a directory discovered in the pageserver's tenants/ directory, attempt
 /// to load a tenant config from it.
 ///
-/// If we cleaned up something expected (like an empty dir or a temp dir), return None.
+/// Returns Ok(None) if the directory is temporary, empty, or not named after a valid tenant shard id.
 fn load_tenant_config(
    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
    dentry: Utf8DirEntry,
-) -> Option<Result<LocationConf, LoadConfigError>> {
+) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
    let tenant_dir_path = dentry.path().to_path_buf();
    if crate::is_temporary(&tenant_dir_path) {
        info!("Found temporary tenant directory, removing: {tenant_dir_path}");
        // No need to use safe_remove_tenant_dir_all because this is already
        // a temporary path
-        std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir");
-        return None;
+        if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
+            error!(
+                "Failed to remove temporary directory '{}': {:?}",
+                tenant_dir_path, e
+            );
+        }
+        return Ok(None);
    }

    // This case happens if we crash during attachment before writing a config into the dir
    let is_empty = tenant_dir_path
        .is_empty_dir()
-        .fatal_err("Checking for empty tenant dir");
+        .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?;
    if is_empty {
        info!("removing empty tenant directory {tenant_dir_path:?}");
-        std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir");
-        return None;
+        if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
+            error!(
+                "Failed to remove empty tenant directory '{}': {e:#}",
+                tenant_dir_path
+            )
+        }
+        return Ok(None);
    }

-    Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
+    let tenant_shard_id = match tenant_dir_path
+        .file_name()
+        .unwrap_or_default()
+        .parse::<TenantShardId>()
+    {
+        Ok(id) => id,
+        Err(_) => {
+            warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",);
+            return Ok(None);
+        }
+    };
+
+    Ok(Some((
+        tenant_shard_id,
+        Tenant::load_tenant_config(conf, &tenant_shard_id),
+    )))
 }

 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -382,51 +405,32 @@ fn load_tenant_config(
 /// seconds even on reasonably fast drives.
 async fn init_load_tenant_configs(
    conf: &'static PageServerConf,
-) -> HashMap<TenantShardId, Result<LocationConf, LoadConfigError>> {
+) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
    let tenants_dir = conf.tenants_path();

-    let dentries = tokio::task::spawn_blocking(move || -> Vec<Utf8DirEntry> {
-        let context = format!("read tenants dir {tenants_dir}");
-        let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context);
+    let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
+        let dir_entries = tenants_dir
+            .read_dir_utf8()
+            .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;

-        dir_entries
-            .collect::<Result<Vec<_>, std::io::Error>>()
-            .fatal_err(&context)
+        Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
    })
-    .await
-    .expect("Config load task panicked");
+    .await??;

    let mut configs = HashMap::new();

    let mut join_set = JoinSet::new();
    for dentry in dentries {
-        let tenant_shard_id = match dentry.file_name().parse::<TenantShardId>() {
-            Ok(id) => id,
-            Err(_) => {
-                warn!(
-                    "Invalid tenant path (garbage in our repo directory?): '{}'",
-                    dentry.file_name()
-                );
-                continue;
-            }
-        };
-
-        join_set.spawn_blocking(move || {
-            (
-                tenant_shard_id,
-                load_tenant_config(conf, tenant_shard_id, dentry),
-            )
-        });
+        join_set.spawn_blocking(move || load_tenant_config(conf, dentry));
    }

    while let Some(r) = join_set.join_next().await {
-        let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task");
-        if let Some(tenant_config) = tenant_config {
-            configs.insert(tenant_shard_id, tenant_config);
+        if let Some((tenant_id, tenant_config)) = r?? {
+            configs.insert(tenant_id, tenant_config);
        }
    }

-    configs
+    Ok(configs)
 }

 #[derive(Debug, thiserror::Error)]
@@ -468,7 +472,7 @@ pub async fn init_tenant_mgr(
    );

    // Scan local filesystem for attached tenants
-    let tenant_configs = init_load_tenant_configs(conf).await;
+    let tenant_configs = init_load_tenant_configs(conf).await?;

    // Determine which tenants are to be secondary or attached, and in which generation
    let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
@@ -586,23 +590,31 @@ pub async fn init_tenant_mgr(
    );
    // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
    for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
-        // Writing a config to local disk is foundational to startup up tenants: panic if we can't.
-        config_write_result.fatal_err("write tenant shard config file");
+        // Errors writing configs are fatal
+        config_write_result?;

        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
-                conf,
-                tenant_shard_id,
-                &tenant_dir_path,
-                resources.clone(),
-                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                shard_identity,
-                Some(init_order.clone()),
-                SpawnMode::Lazy,
-                &ctx,
-            )),
+            LocationMode::Attached(attached_conf) => {
+                match tenant_spawn(
+                    conf,
+                    tenant_shard_id,
+                    &tenant_dir_path,
+                    resources.clone(),
+                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                    shard_identity,
+                    Some(init_order.clone()),
+                    SpawnMode::Lazy,
+                    &ctx,
+                ) {
+                    Ok(tenant) => TenantSlot::Attached(tenant),
+                    Err(e) => {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
+                        continue;
+                    }
+                }
+            }
            LocationMode::Secondary(secondary_conf) => {
                info!(
                    tenant_id = %tenant_shard_id.tenant_id,
@@ -637,7 +649,8 @@ pub async fn init_tenant_mgr(
    })
 }

-/// Wrapper for Tenant::spawn that checks invariants before running
+/// Wrapper for Tenant::spawn that checks invariants before running, and returns
+/// an error instead of spawning the tenant if any of them is violated.
 #[allow(clippy::too_many_arguments)]
 fn tenant_spawn(
    conf: &'static PageServerConf,
@@ -649,18 +662,23 @@ fn tenant_spawn(
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
    ctx: &RequestContext,
-) -> Arc<Tenant> {
-    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
-    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
-    // to avoid impacting prod runtime performance.
-    assert!(!crate::is_temporary(tenant_path));
-    debug_assert!(tenant_path.is_dir());
-    debug_assert!(conf
-        .tenant_location_config_path(&tenant_shard_id)
-        .try_exists()
-        .unwrap());
+) -> anyhow::Result<Arc<Tenant>> {
+    anyhow::ensure!(
+        tenant_path.is_dir(),
+        "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
+    );
+    anyhow::ensure!(
+        !crate::is_temporary(tenant_path),
+        "Cannot load tenant from temporary path {tenant_path:?}"
+    );
+    anyhow::ensure!(
+        !tenant_path.is_empty_dir().with_context(|| {
+            format!("Failed to check whether {tenant_path:?} is an empty dir")
+        })?,
+        "Cannot load tenant from empty directory {tenant_path:?}"
+    );

-    Tenant::spawn(
+    let tenant = Tenant::spawn(
        conf,
        tenant_shard_id,
        resources,
@@ -669,7 +687,9 @@ fn tenant_spawn(
        init_order,
        mode,
        ctx,
-    )
+    );
+
+    Ok(tenant)
 }

 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
@@ -820,9 +840,8 @@ pub(crate) enum UpsertLocationError {
    #[error("Failed to flush: {0}")]
    Flush(anyhow::Error),

-    /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state.
    #[error("Internal error: {0}")]
-    InternalError(anyhow::Error),
+    Other(#[from] anyhow::Error),
 }

 impl TenantManager {
@@ -952,8 +971,7 @@ impl TenantManager {
        match fast_path_taken {
            Some(FastPathModified::Attached(tenant)) => {
                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                    .await?;

                // Transition to AttachedStale means we may well hold a valid generation
                // still, and have been requested to go stale as part of a migration.  If
@@ -983,8 +1001,7 @@ impl TenantManager {
            }
            Some(FastPathModified::Secondary(_secondary_tenant)) => {
                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                    .await?;

                return Ok(None);
            }
@@ -1050,7 +1067,7 @@ impl TenantManager {
            Some(TenantSlot::InProgress(_)) => {
                // This should never happen: acquire_slot should error out
                // if the contents of a slot were InProgress.
-                return Err(UpsertLocationError::InternalError(anyhow::anyhow!(
+                return Err(UpsertLocationError::Other(anyhow::anyhow!(
                    "Acquired an InProgress slot, this is a bug."
                )));
            }
@@ -1069,14 +1086,12 @@ impl TenantManager {
        // Does not need to be fsync'd because local storage is just a cache.
        tokio::fs::create_dir_all(&timelines_path)
            .await
-            .fatal_err("create timelines/ dir");
+            .with_context(|| format!("Creating {timelines_path}"))?;

        // Before activating either secondary or attached mode, persist the
        // configuration, so that on restart we will re-attach (or re-start
        // secondary) on the tenant.
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-            .await
-            .fatal_err("write tenant shard config");
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;

        let new_slot = match &new_location_config.mode {
            LocationMode::Secondary(secondary_config) => {
@@ -1095,15 +1110,13 @@ impl TenantManager {
                // from upserts.  This enables creating generation-less tenants even though neon_local
                // always uses generations when calling the location conf API.
                let attached_conf = if cfg!(feature = "testing") {
-                    let mut conf = AttachedTenantConf::try_from(new_location_config)
-                        .map_err(UpsertLocationError::BadRequest)?;
+                    let mut conf = AttachedTenantConf::try_from(new_location_config)?;
                    if self.conf.control_plane_api.is_none() {
                        conf.location.generation = Generation::none();
                    }
                    conf
                } else {
-                    AttachedTenantConf::try_from(new_location_config)
-                        .map_err(UpsertLocationError::BadRequest)?
+                    AttachedTenantConf::try_from(new_location_config)?
                };

                let tenant = tenant_spawn(
@@ -1116,7 +1129,7 @@ impl TenantManager {
                    None,
                    spawn_mode,
                    ctx,
-                );
+                )?;

                TenantSlot::Attached(tenant)
            }
@@ -1130,7 +1143,7 @@ impl TenantManager {

        match slot_guard.upsert(new_slot) {
            Err(TenantSlotUpsertError::InternalError(e)) => {
-                Err(UpsertLocationError::InternalError(anyhow::anyhow!(e)))
+                Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
            }
            Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
            Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
@@ -1237,7 +1250,7 @@ impl TenantManager {
            None,
            SpawnMode::Eager,
            ctx,
-        );
+        )?;

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -1971,7 +1984,7 @@ impl TenantManager {
            None,
            SpawnMode::Eager,
            ctx,
-        );
+        )?;

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -519,7 +519,7 @@ impl RemoteTimelineClient {
        local_path: &Utf8Path,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<u64, DownloadError> {
+    ) -> anyhow::Result<u64> {
        let downloaded_size = {
            let _unfinished_gauge_guard = self.metrics.call_begin(
                &RemoteOpFileKind::Layer,
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -23,8 +23,6 @@ use super::{
    storage_layer::LayerName,
 };

-use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
-use metrics::UIntGauge;
 use pageserver_api::{
    models,
    shard::{ShardIdentity, TenantShardId},
@@ -101,17 +99,6 @@ pub(crate) struct SecondaryTenant {

    // Public state indicating overall progress of downloads relative to the last heatmap seen
    pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
-
-    // Sum of layer sizes on local disk
-    pub(super) resident_size_metric: UIntGauge,
-}
-
-impl Drop for SecondaryTenant {
-    fn drop(&mut self) {
-        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
-        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
-        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
-    }
 }

 impl SecondaryTenant {
@@ -121,12 +108,6 @@ impl SecondaryTenant {
        tenant_conf: TenantConfOpt,
        config: &SecondaryLocationConfig,
    ) -> Arc<Self> {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_id = format!("{}", tenant_shard_id.shard_slug());
-        let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id])
-            .unwrap();
-
        Arc::new(Self {
            tenant_shard_id,
            // todo: shall we make this a descendent of the
@@ -142,8 +123,6 @@ impl SecondaryTenant {
            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),

            progress: std::sync::Mutex::default(),
-
-            resident_size_metric,
        })
    }

@@ -232,12 +211,16 @@ impl SecondaryTenant {
            // have to 100% match what is on disk, because it's a best-effort warming
            // of the cache.
            let mut detail = this.detail.lock().unwrap();
-            if let Some(removed) =
-                detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
-            {
-                // We might race with removal of the same layer during downloads, so finding the layer we
-                // were trying to remove is optional.  Only issue the disk I/O to remove it if we found it.
-                removed.remove_blocking();
+            if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
+                let removed = timeline_detail.on_disk_layers.remove(&name);
+
+                // We might race with removal of the same layer during downloads, if it was removed
+                // from the heatmap.  If we see that the OnDiskState is gone, then no need to
+                // do a physical deletion or store in evicted_at.
+                if let Some(removed) = removed {
+                    removed.remove_blocking();
+                    timeline_detail.evicted_at.insert(name, now);
+                }
            }
        })
        .await
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -46,7 +46,6 @@ use crate::tenant::{
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
-use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
@@ -132,66 +131,16 @@ impl OnDiskState {
            .or_else(fs_ext::ignore_not_found)
            .fatal_err("Deleting secondary layer")
    }
-
-    pub(crate) fn file_size(&self) -> u64 {
-        self.metadata.file_size
-    }
 }

 #[derive(Debug, Clone, Default)]
 pub(super) struct SecondaryDetailTimeline {
-    on_disk_layers: HashMap<LayerName, OnDiskState>,
+    pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,

    /// We remember when layers were evicted, to prevent re-downloading them.
    pub(super) evicted_at: HashMap<LayerName, SystemTime>,
 }

-impl SecondaryDetailTimeline {
-    pub(super) fn remove_layer(
-        &mut self,
-        name: &LayerName,
-        resident_metric: &UIntGauge,
-    ) -> Option<OnDiskState> {
-        let removed = self.on_disk_layers.remove(name);
-        if let Some(removed) = &removed {
-            resident_metric.sub(removed.file_size());
-        }
-        removed
-    }
-
-    /// `local_path`
-    fn touch_layer<F>(
-        &mut self,
-        conf: &'static PageServerConf,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-        touched: &HeatMapLayer,
-        resident_metric: &UIntGauge,
-        local_path: F,
-    ) where
-        F: FnOnce() -> Utf8PathBuf,
-    {
-        use std::collections::hash_map::Entry;
-        match self.on_disk_layers.entry(touched.name.clone()) {
-            Entry::Occupied(mut v) => {
-                v.get_mut().access_time = touched.access_time;
-            }
-            Entry::Vacant(e) => {
-                e.insert(OnDiskState::new(
-                    conf,
-                    tenant_shard_id,
-                    timeline_id,
-                    touched.name.clone(),
-                    touched.metadata.clone(),
-                    touched.access_time,
-                    local_path(),
-                ));
-                resident_metric.add(touched.metadata.file_size);
-            }
-        }
-    }
-}
-
 // Aspects of a heatmap that we remember after downloading it
 #[derive(Clone, Debug)]
 struct DownloadSummary {
@@ -209,7 +158,7 @@ pub(super) struct SecondaryDetail {

    last_download: Option<DownloadSummary>,
    next_download: Option<Instant>,
-    timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
+    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }

 /// Helper for logging SystemTime
@@ -242,38 +191,6 @@ impl SecondaryDetail {
        }
    }

-    pub(super) fn evict_layer(
-        &mut self,
-        name: LayerName,
-        timeline_id: &TimelineId,
-        now: SystemTime,
-        resident_metric: &UIntGauge,
-    ) -> Option<OnDiskState> {
-        let timeline = self.timelines.get_mut(timeline_id)?;
-        let removed = timeline.remove_layer(&name, resident_metric);
-        if removed.is_some() {
-            timeline.evicted_at.insert(name, now);
-        }
-        removed
-    }
-
-    pub(super) fn remove_timeline(
-        &mut self,
-        timeline_id: &TimelineId,
-        resident_metric: &UIntGauge,
-    ) {
-        let removed = self.timelines.remove(timeline_id);
-        if let Some(removed) = removed {
-            resident_metric.sub(
-                removed
-                    .on_disk_layers
-                    .values()
-                    .map(|l| l.metadata.file_size)
-                    .sum(),
-            );
-        }
-    }
-
    /// Additionally returns the total number of layers, used for more stable relative access time
    /// based eviction.
    pub(super) fn get_layers_for_eviction(
@@ -345,7 +262,6 @@ impl scheduler::RunningJob for RunningDownload {
 struct CompleteDownload {
    secondary_state: Arc<SecondaryTenant>,
    completed_at: Instant,
-    result: Result<(), UpdateError>,
 }

 impl scheduler::Completion for CompleteDownload {
@@ -370,33 +286,21 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
        let CompleteDownload {
            secondary_state,
            completed_at: _completed_at,
-            result,
        } = completion;

        tracing::debug!("Secondary tenant download completed");

        let mut detail = secondary_state.detail.lock().unwrap();

-        match result {
-            Err(UpdateError::Restart) => {
-                // Start downloading again as soon as we can.  This will involve waiting for the scheduler's
-                // scheduling interval.  This slightly reduces the peak download speed of tenants that hit their
-                // deadline and keep restarting, but that also helps give other tenants a chance to execute rather
-                // that letting one big tenant dominate for a long time.
-                detail.next_download = Some(Instant::now());
-            }
-            _ => {
-                let period = detail
-                    .last_download
-                    .as_ref()
-                    .map(|d| d.upload_period)
-                    .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
+        let period = detail
+            .last_download
+            .as_ref()
+            .map(|d| d.upload_period)
+            .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);

-                // We advance next_download irrespective of errors: we don't want error cases to result in
-                // expensive busy-polling.
-                detail.next_download = Some(Instant::now() + period_jitter(period, 5));
-            }
-        }
+        // We advance next_download irrespective of errors: we don't want error cases to result in
+        // expensive busy-polling.
+        detail.next_download = Some(Instant::now() + period_jitter(period, 5));
    }

    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -492,10 +396,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
        (RunningDownload { barrier }, Box::pin(async move {
            let _completion = completion;

-            let result = TenantDownloader::new(conf, &remote_storage, &secondary_state)
+            match TenantDownloader::new(conf, &remote_storage, &secondary_state)
                .download(&download_ctx)
-                .await;
-            match &result
+                .await
            {
                Err(UpdateError::NoData) => {
                    tracing::info!("No heatmap found for tenant.  This is fine if it is new.");
@@ -512,9 +415,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
                    tracing::error!("Error while downloading tenant: {e}");
                },
-                Err(UpdateError::Restart) => {
-                    tracing::info!("Download reached deadline & will restart to update heatmap")
-                }
                Ok(()) => {}
            };

@@ -536,7 +436,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            CompleteDownload {
                secondary_state,
                completed_at: Instant::now(),
-                result
            }
        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
    }
@@ -553,11 +452,6 @@ struct TenantDownloader<'a> {
 /// Errors that may be encountered while updating a tenant
 #[derive(thiserror::Error, Debug)]
 enum UpdateError {
-    /// This is not a true failure, but it's how a download indicates that it would like to be restarted by
-    /// the scheduler, to pick up the latest heatmap
-    #[error("Reached deadline, restarting downloads")]
-    Restart,
-
    #[error("No remote data found")]
    NoData,
    #[error("Insufficient local storage space")]
@@ -684,13 +578,8 @@ impl<'a> TenantDownloader<'a> {
                Some(t) => t,
                None => {
                    // We have no existing state: need to scan local disk for layers first.
-                    let timeline_state = init_timeline_state(
-                        self.conf,
-                        tenant_shard_id,
-                        timeline,
-                        &self.secondary_state.resident_size_metric,
-                    )
-                    .await;
+                    let timeline_state =
+                        init_timeline_state(self.conf, tenant_shard_id, timeline).await;

                    // Re-acquire detail lock now that we're done with async load from local FS
                    self.secondary_state
@@ -714,26 +603,6 @@ impl<'a> TenantDownloader<'a> {
                self.prepare_timelines(&heatmap, heatmap_mtime).await?;
        }

-        // Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again,
-        // so that we are always using reasonably a fresh heatmap.  Otherwise, if we had really huge content to download, we might
-        // spend 10s of minutes downloading layers we don't need.
-        // (see https://github.com/neondatabase/neon/issues/8182)
-        let deadline = {
-            let period = self
-                .secondary_state
-                .detail
-                .lock()
-                .unwrap()
-                .last_download
-                .as_ref()
-                .map(|d| d.upload_period)
-                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
-
-            // Use double the period: we are not promising to complete within the period, this is just a heuristic
-            // to keep using a "reasonably fresh" heatmap.
-            Instant::now() + period * 2
-        };
-
        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
            let timeline_state = timeline_states
@@ -749,7 +618,7 @@ impl<'a> TenantDownloader<'a> {
            }

            let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline, timeline_state, deadline, ctx)
+            self.download_timeline(timeline, timeline_state, ctx)
                .instrument(tracing::info_span!(
                    "secondary_download_timeline",
                    tenant_id=%tenant_shard_id.tenant_id,
@@ -759,25 +628,6 @@ impl<'a> TenantDownloader<'a> {
                .await?;
        }

-        // Metrics consistency check in testing builds
-        if cfg!(feature = "testing") {
-            let detail = self.secondary_state.detail.lock().unwrap();
-            let resident_size = detail
-                .timelines
-                .values()
-                .map(|tl| {
-                    tl.on_disk_layers
-                        .values()
-                        .map(|v| v.metadata.file_size)
-                        .sum::<u64>()
-                })
-                .sum::<u64>();
-            assert_eq!(
-                resident_size,
-                self.secondary_state.resident_size_metric.get()
-            );
-        }
-
        // Only update last_etag after a full successful download: this way will not skip
        // the next download, even if the heatmap's actual etag is unchanged.
        self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
@@ -890,7 +740,7 @@ impl<'a> TenantDownloader<'a> {
            for delete_timeline in &delete_timelines {
                // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
                // from disk fails that will be a fatal error.
-                detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric);
+                detail.timelines.remove(delete_timeline);
            }
        }

@@ -908,7 +758,7 @@ impl<'a> TenantDownloader<'a> {
            let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
                continue;
            };
-            timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
+            timeline_state.on_disk_layers.remove(&layer_name);
        }

        for timeline_id in delete_timelines {
@@ -977,28 +827,26 @@ impl<'a> TenantDownloader<'a> {
        .and_then(|x| x)
    }

-    /// Download heatmap layers that are not present on local disk, or update their
-    /// access time if they are already present.
-    async fn download_timeline_layers(
+    async fn download_timeline(
        &self,
-        tenant_shard_id: &TenantShardId,
        timeline: HeatMapTimeline,
        timeline_state: SecondaryDetailTimeline,
-        deadline: Instant,
        ctx: &RequestContext,
-    ) -> (Result<(), UpdateError>, Vec<HeatMapLayer>) {
+    ) -> Result<(), UpdateError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+
        // Accumulate updates to the state
        let mut touched = Vec::new();

+        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+
+        // Download heatmap layers that are not present on local disk, or update their
+        // access time if they are already present.
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!("Cancelled -- dropping out of layer loop");
-                return (Err(UpdateError::Cancelled), touched);
-            }
-
-            if Instant::now() > deadline {
-                // We've been running downloads for a while, restart to download latest heatmap.
-                return (Err(UpdateError::Restart), touched);
+                return Err(UpdateError::Cancelled);
            }

            // Existing on-disk layers: just update their access time.
@@ -1068,66 +916,52 @@ impl<'a> TenantDownloader<'a> {

            match self
                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
-                .await
+                .await?
            {
-                Ok(Some(layer)) => touched.push(layer),
-                Ok(None) => {
+                Some(layer) => touched.push(layer),
+                None => {
                    // Not an error but we didn't download it: remote layer is missing.  Don't add it to the list of
                    // things to consider touched.
                }
-                Err(e) => {
-                    return (Err(e), touched);
-                }
            }
        }

-        (Ok(()), touched)
-    }
-
-    async fn download_timeline(
-        &self,
-        timeline: HeatMapTimeline,
-        timeline_state: SecondaryDetailTimeline,
-        deadline: Instant,
-        ctx: &RequestContext,
-    ) -> Result<(), UpdateError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        let timeline_id = timeline.timeline_id;
-
-        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
-
-        let (result, touched) = self
-            .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
-            .await;
-
-        // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful
+        // Write updates to state to record layers we just downloaded or touched.
        {
            let mut detail = self.secondary_state.detail.lock().unwrap();
-            let timeline_detail = detail.timelines.entry(timeline_id).or_default();
+            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();

            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
-            touched.into_iter().for_each(|t| {
-                timeline_detail.touch_layer(
-                    self.conf,
-                    tenant_shard_id,
-                    &timeline_id,
-                    &t,
-                    &self.secondary_state.resident_size_metric,
-                    || {
-                        local_layer_path(
+
+            for t in touched {
+                use std::collections::hash_map::Entry;
+                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
+                    Entry::Occupied(mut v) => {
+                        v.get_mut().access_time = t.access_time;
+                    }
+                    Entry::Vacant(e) => {
+                        let local_path = local_layer_path(
                            self.conf,
                            tenant_shard_id,
-                            &timeline_id,
+                            &timeline.timeline_id,
                            &t.name,
                            &t.metadata.generation,
-                        )
-                    },
-                )
-            });
+                        );
+                        e.insert(OnDiskState::new(
+                            self.conf,
+                            tenant_shard_id,
+                            &timeline.timeline_id,
+                            t.name,
+                            t.metadata.clone(),
+                            t.access_time,
+                            local_path,
+                        ));
+                    }
+                }
+            }
        }

-        result
+        Ok(())
    }

    /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics
@@ -1233,7 +1067,6 @@ async fn init_timeline_state(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
    heatmap: &HeatMapTimeline,
-    resident_metric: &UIntGauge,
 ) -> SecondaryDetailTimeline {
    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
    let mut detail = SecondaryDetailTimeline::default();
@@ -1309,13 +1142,17 @@ async fn init_timeline_state(
                        } else {
                            // We expect the access time to be initialized immediately afterwards, when
                            // the latest heatmap is applied to the state.
-                            detail.touch_layer(
-                                conf,
-                                tenant_shard_id,
-                                &heatmap.timeline_id,
-                                remote_meta,
-                                resident_metric,
-                                || file_path,
+                            detail.on_disk_layers.insert(
+                                name.clone(),
+                                OnDiskState::new(
+                                    conf,
+                                    tenant_shard_id,
+                                    &heatmap.timeline_id,
+                                    name,
+                                    remote_meta.metadata.clone(),
+                                    remote_meta.access_time,
+                                    file_path,
+                                ),
                            );
                        }
                    }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,7 +3,6 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

-use tenant_size_model::svg::SvgBranchKind;
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -88,9 +87,6 @@ impl SegmentMeta {
            LsnKind::BranchPoint => true,
            LsnKind::GcCutOff => true,
            LsnKind::BranchEnd => false,
-            LsnKind::LeasePoint => true,
-            LsnKind::LeaseStart => false,
-            LsnKind::LeaseEnd => false,
        }
    }
 }
@@ -107,21 +103,6 @@ pub enum LsnKind {
    GcCutOff,
    /// Last record LSN
    BranchEnd,
-    /// A LSN lease is granted here.
-    LeasePoint,
-    /// A lease starts from here.
-    LeaseStart,
-    /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]).
-    LeaseEnd,
-}
-
-impl From<LsnKind> for SvgBranchKind {
-    fn from(kind: LsnKind) -> Self {
-        match kind {
-            LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease,
-            _ => SvgBranchKind::Timeline,
-        }
-    }
 }

 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
@@ -143,9 +124,6 @@ pub struct TimelineInputs {

    /// Cutoff point calculated from the user-supplied 'max_retention_period'
    retention_param_cutoff: Option<Lsn>,
-
-    /// Lease points on the timeline
-    lease_points: Vec<Lsn>,
 }

 /// Gathers the inputs for the tenant sizing model.
@@ -256,13 +234,6 @@ pub(super) async fn gather_inputs(
            None
        };

-        let lease_points = gc_info
-            .leases
-            .keys()
-            .filter(|&&lsn| lsn > ancestor_lsn)
-            .copied()
-            .collect::<Vec<_>>();
-
        // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
        // want to query any logical size before initdb_lsn.
        let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -277,8 +248,6 @@ pub(super) async fn gather_inputs(
            .map(|lsn| (lsn, LsnKind::BranchPoint))
            .collect::<Vec<_>>();

-        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
-
        drop(gc_info);

        // Add branch points we collected earlier, just in case there were any that were
@@ -327,7 +296,6 @@ pub(super) async fn gather_inputs(
            if kind == LsnKind::BranchPoint {
                branchpoint_segments.insert((timeline_id, lsn), segments.len());
            }
-
            segments.push(SegmentMeta {
                segment: Segment {
                    parent: Some(parent),
@@ -338,45 +306,7 @@ pub(super) async fn gather_inputs(
                timeline_id: timeline.timeline_id,
                kind,
            });
-
-            parent = segments.len() - 1;
-
-            if kind == LsnKind::LeasePoint {
-                // Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data
-                // (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN
-                // value. Without the other two segments, the calculation code would not count the leased LSN as a point
-                // to be retained.
-                // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug.
-                //
-                // Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and
-                // branch points can be given a synthetic id so we can unite them.
-                let mut lease_parent = parent;
-
-                // Start of a lease.
-                segments.push(SegmentMeta {
-                    segment: Segment {
-                        parent: Some(lease_parent),
-                        lsn: lsn.0,
-                        size: None,                   // Filled in later, if necessary
-                        needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
-                    },
-                    timeline_id: timeline.timeline_id,
-                    kind: LsnKind::LeaseStart,
-                });
-                lease_parent += 1;
-
-                // End of the lease.
-                segments.push(SegmentMeta {
-                    segment: Segment {
-                        parent: Some(lease_parent),
-                        lsn: lsn.0,
-                        size: None,   // Filled in later, if necessary
-                        needed: true, // everything at the lease LSN must be readable => is needed
-                    },
-                    timeline_id: timeline.timeline_id,
-                    kind: LsnKind::LeaseEnd,
-                });
-            }
+            parent += 1;
        }

        // Current end of the timeline
@@ -402,7 +332,6 @@ pub(super) async fn gather_inputs(
            pitr_cutoff,
            next_gc_cutoff,
            retention_param_cutoff,
-            lease_points,
        });
    }

@@ -745,8 +674,7 @@ fn verify_size_for_multiple_branches() {
      "horizon_cutoff": "0/2210CD0",
      "pitr_cutoff": "0/2210CD0",
      "next_gc_cutoff": "0/2210CD0",
-      "retention_param_cutoff": null,
-      "lease_points": []
+      "retention_param_cutoff": null
    },
    {
      "timeline_id": "454626700469f0a9914949b9d018e876",
@@ -756,8 +684,7 @@ fn verify_size_for_multiple_branches() {
      "horizon_cutoff": "0/1817770",
      "pitr_cutoff": "0/1817770",
      "next_gc_cutoff": "0/1817770",
-      "retention_param_cutoff": null,
-      "lease_points": []
+      "retention_param_cutoff": null
    },
    {
      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
@@ -767,8 +694,7 @@ fn verify_size_for_multiple_branches() {
      "horizon_cutoff": "0/18B3D98",
      "pitr_cutoff": "0/18B3D98",
      "next_gc_cutoff": "0/18B3D98",
-      "retention_param_cutoff": null,
-      "lease_points": []
+      "retention_param_cutoff": null
    }
  ]
 }
@@ -823,8 +749,7 @@ fn verify_size_for_one_branch() {
      "horizon_cutoff": "47/240A5860",
      "pitr_cutoff": "47/240A5860",
      "next_gc_cutoff": "47/240A5860",
-      "retention_param_cutoff": "0/0",
-      "lease_points": []
+      "retention_param_cutoff": "0/0"
    }
  ]
 }"#;
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -7,9 +7,6 @@ pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;

-#[cfg(test)]
-pub mod merge_iterator;
-
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
 use crate::task_mgr::TaskKind;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
-use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -223,11 +223,6 @@ pub struct DeltaLayerInner {
    file: VirtualFile,
    file_id: FileId,

-    #[allow(dead_code)]
-    layer_key_range: Range<Key>,
-    #[allow(dead_code)]
-    layer_lsn_range: Range<Lsn>,
-
    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

@@ -457,12 +452,7 @@ impl DeltaLayerWriterInner {
        ctx: &RequestContext,
    ) -> (Vec<u8>, anyhow::Result<()>) {
        assert!(self.lsn_range.start <= lsn);
-        // We don't want to use compression in delta layer creation
-        let compression = ImageCompressionAlgorithm::Disabled;
-        let (val, res) = self
-            .blob_writer
-            .write_blob_maybe_compressed(val, ctx, compression)
-            .await;
+        let (val, res) = self.blob_writer.write_blob(val, ctx).await;
        let off = match res {
            Ok(off) => off,
            Err(e) => return (val, Err(anyhow::anyhow!(e))),
@@ -747,16 +737,6 @@ impl DeltaLayer {
 }

 impl DeltaLayerInner {
-    #[cfg(test)]
-    pub(crate) fn key_range(&self) -> &Range<Key> {
-        &self.layer_key_range
-    }
-
-    #[cfg(test)]
-    pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
-        &self.layer_lsn_range
-    }
-
    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
    /// - inner has the success or transient failure
    /// - outer has the permanent failure
@@ -805,8 +785,6 @@ impl DeltaLayerInner {
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
            max_vectored_read_bytes,
-            layer_key_range: actual_summary.key_range,
-            layer_lsn_range: actual_summary.lsn_range,
        }))
    }

@@ -1656,7 +1634,7 @@ impl<'a> DeltaLayerIterator<'a> {
 }

 #[cfg(test)]
-pub(crate) mod test {
+mod test {
    use std::collections::BTreeMap;

    use itertools::MinMaxResult;
@@ -2234,20 +2212,13 @@ pub(crate) mod test {
        }
    }

-    pub(crate) fn sort_delta(
-        (k1, l1, _): &(Key, Lsn, Value),
-        (k2, l2, _): &(Key, Lsn, Value),
-    ) -> std::cmp::Ordering {
-        (k1, l1).cmp(&(k2, l2))
-    }
-
-    pub(crate) async fn produce_delta_layer(
+    async fn produce_delta_layer(
        tenant: &Tenant,
        tline: &Arc<Timeline>,
        mut deltas: Vec<(Key, Lsn, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
-        deltas.sort_by(sort_delta);
+        deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
        let (key_start, _, _) = deltas.first().unwrap();
        let (key_max, _, _) = deltas.first().unwrap();
        let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -369,16 +369,6 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
-    #[cfg(test)]
-    pub(crate) fn key_range(&self) -> &Range<Key> {
-        &self.key_range
-    }
-
-    #[cfg(test)]
-    pub(crate) fn lsn(&self) -> Lsn {
-        self.lsn
-    }
-
    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
    /// - inner has the success or transient failure
    /// - outer has the permanent failure
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -6,14 +6,13 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
-use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
+use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::ValueReconstructResult;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{PageReconstructError, Timeline};
-use crate::{l0_flush, page_cache, walrecord};
+use crate::{page_cache, walrecord};
 use anyhow::{anyhow, ensure, Result};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
@@ -411,7 +410,6 @@ impl InMemoryLayer {
                continue;
            }

-            // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
            let buf = reader.read_blob(block_read.block_offset, &ctx).await;
            if let Err(e) = buf {
                reconstruct_state
@@ -622,13 +620,6 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().await;

-        let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
-        use l0_flush::Inner;
-        let _concurrency_permit = match &*l0_flush_global_state {
-            Inner::PageCached => None,
-            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
-        };
-
        let end_lsn = *self.end_lsn.get().unwrap();

        let key_count = if let Some(key_range) = key_range {
@@ -654,83 +645,28 @@ impl InMemoryLayer {
        )
        .await?;

-        match &*l0_flush_global_state {
-            l0_flush::Inner::PageCached => {
-                let ctx = RequestContextBuilder::extend(ctx)
-                    .page_content_kind(PageContentKind::InMemoryLayer)
-                    .build();
+        let mut buf = Vec::new();

-                let mut buf = Vec::new();
+        let cursor = inner.file.block_cursor();

-                let cursor = inner.file.block_cursor();
-
-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let res;
-                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
-                            .await;
-                        res?;
-                    }
-                }
-            }
-            l0_flush::Inner::Direct { .. } => {
-                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
-                assert_eq!(
-                    file_contents.len() % PAGE_SZ,
-                    0,
-                    "needed by BlockReaderRef::Slice"
-                );
-                assert_eq!(file_contents.len(), {
-                    let written = usize::try_from(inner.file.len()).unwrap();
-                    if written % PAGE_SZ == 0 {
-                        written
-                    } else {
-                        written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
-                    }
-                });
-
-                let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
-
-                let mut buf = Vec::new();
-
-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        // TODO: once we have blob lengths in the in-memory index, we can
-                        // 1. get rid of the blob_io / BlockReaderRef::Slice business and
-                        // 2. load the file contents into a Bytes and
-                        // 3. the use `Bytes::slice` to get the `buf` that is our blob
-                        // 4. pass that `buf` into `put_value_bytes`
-                        // => https://github.com/neondatabase/neon/issues/8183
-                        cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let res;
-                        (buf, res) = delta_layer_writer
-                            .put_value_bytes(*key, *lsn, buf, will_init, ctx)
-                            .await;
-                        res?;
-                    }
-                }
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+        for (key, vec_map) in inner.index.iter() {
+            // Write all page versions
+            for (lsn, pos) in vec_map.as_slice() {
+                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
+                let will_init = Value::des(&buf)?.will_init();
+                let res;
+                (buf, res) = delta_layer_writer
+                    .put_value_bytes(*key, *lsn, buf, will_init, &ctx)
+                    .await;
+                res?;
            }
        }

        // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
-
-        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
-        //
-        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
-        // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
-        // Thus, we'd have more concurrenct `Vec<u8>` in existence than the semaphore allows.
-        //
-        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
-        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
-        drop(_concurrency_permit);
-
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
        Ok(Some(delta_layer))
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1096,10 +1096,19 @@ impl LayerInner {

        match rx.await {
            Ok(Ok(res)) => Ok(res),
-            Ok(Err(remote_storage::DownloadError::Cancelled)) => {
-                Err(DownloadError::DownloadCancelled)
+            Ok(Err(e)) => {
+                // sleep already happened in the spawned task, if it was not cancelled
+                match e.downcast_ref::<remote_storage::DownloadError>() {
+                    // If the download failed due to its cancellation token,
+                    // propagate the cancellation error upstream.
+                    Some(remote_storage::DownloadError::Cancelled) => {
+                        Err(DownloadError::DownloadCancelled)
+                    }
+                    // FIXME: this is not embedding the error because historically it would have
+                    // been output to compute, however that is no longer the case.
+                    _ => Err(DownloadError::DownloadFailed),
+                }
            }
-            Ok(Err(_)) => Err(DownloadError::DownloadFailed),
            Err(_gone) => Err(DownloadError::DownloadCancelled),
        }
    }
@@ -1109,7 +1118,7 @@ impl LayerInner {
        timeline: Arc<Timeline>,
        permit: heavier_once_cell::InitPermit,
        ctx: &RequestContext,
-    ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
+    ) -> anyhow::Result<Arc<DownloadedLayer>> {
        let result = timeline
            .remote_client
            .download_layer_file(
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -1,412 +0,0 @@
-use std::{
-    cmp::Ordering,
-    collections::{binary_heap, BinaryHeap},
-};
-
-use pageserver_api::key::Key;
-use utils::lsn::Lsn;
-
-use crate::{context::RequestContext, repository::Value};
-
-use super::{
-    delta_layer::{DeltaLayerInner, DeltaLayerIterator},
-    image_layer::{ImageLayerInner, ImageLayerIterator},
-};
-
-#[derive(Clone, Copy)]
-enum LayerRef<'a> {
-    Image(&'a ImageLayerInner),
-    Delta(&'a DeltaLayerInner),
-}
-
-impl<'a> LayerRef<'a> {
-    fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
-        match self {
-            Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
-            Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
-        }
-    }
-}
-
-enum LayerIterRef<'a> {
-    Image(ImageLayerIterator<'a>),
-    Delta(DeltaLayerIterator<'a>),
-}
-
-impl LayerIterRef<'_> {
-    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        match self {
-            Self::Delta(x) => x.next().await,
-            Self::Image(x) => x.next().await,
-        }
-    }
-}
-
-/// This type plays several roles at once
-/// 1. Unified iterator for image and delta layers.
-/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
-/// 3. Lazy creation of the real delta/image iterator.
-enum IteratorWrapper<'a> {
-    NotLoaded {
-        ctx: &'a RequestContext,
-        first_key_lower_bound: (Key, Lsn),
-        layer: LayerRef<'a>,
-    },
-    Loaded {
-        iter: PeekableLayerIterRef<'a>,
-    },
-}
-
-struct PeekableLayerIterRef<'a> {
-    iter: LayerIterRef<'a>,
-    peeked: Option<(Key, Lsn, Value)>, // None == end
-}
-
-impl<'a> PeekableLayerIterRef<'a> {
-    async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> {
-        let peeked = iter.next().await?;
-        Ok(Self { iter, peeked })
-    }
-
-    fn peek(&self) -> &Option<(Key, Lsn, Value)> {
-        &self.peeked
-    }
-
-    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        let result = self.peeked.take();
-        self.peeked = self.iter.next().await?;
-        Ok(result)
-    }
-}
-
-impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
-    fn eq(&self, other: &Self) -> bool {
-        self.cmp(other) == Ordering::Equal
-    }
-}
-
-impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}
-
-impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        use std::cmp::Ordering;
-        let a = self.peek_next_key_lsn();
-        let b = other.peek_next_key_lsn();
-        match (a, b) {
-            (Some((k1, l1)), Some((k2, l2))) => {
-                let loaded_1 = if self.is_loaded() { 1 } else { 0 };
-                let loaded_2 = if other.is_loaded() { 1 } else { 0 };
-                // When key_lsn are the same, the unloaded iter will always appear before the loaded one.
-                // And note that we do a reverse at the end of the comparison, so it works with the max heap.
-                (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
-            }
-            (Some(_), None) => Ordering::Less,
-            (None, Some(_)) => Ordering::Greater,
-            (None, None) => Ordering::Equal,
-        }
-        .reverse()
-    }
-}
-
-impl<'a> IteratorWrapper<'a> {
-    pub fn create_from_image_layer(
-        image_layer: &'a ImageLayerInner,
-        ctx: &'a RequestContext,
-    ) -> Self {
-        Self::NotLoaded {
-            layer: LayerRef::Image(image_layer),
-            first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
-            ctx,
-        }
-    }
-
-    pub fn create_from_delta_layer(
-        delta_layer: &'a DeltaLayerInner,
-        ctx: &'a RequestContext,
-    ) -> Self {
-        Self::NotLoaded {
-            layer: LayerRef::Delta(delta_layer),
-            first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
-            ctx,
-        }
-    }
-
-    fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
-        match self {
-            Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
-            Self::NotLoaded {
-                first_key_lower_bound: (key, lsn),
-                ..
-            } => Some((key, *lsn)),
-        }
-    }
-
-    // CORRECTNESS: this function must always take `&mut self`, never `&self`.
-    //
-    // The reason is that `impl Ord for Self` evaluates differently after this function
-    // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when
-    // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut`
-    // and not just `PeekMut::deref`
-    // If we don't take `&mut self`
-    async fn load(&mut self) -> anyhow::Result<()> {
-        assert!(!self.is_loaded());
-        let Self::NotLoaded {
-            ctx,
-            first_key_lower_bound,
-            layer,
-        } = self
-        else {
-            unreachable!()
-        };
-        let iter = layer.iter(ctx);
-        let iter = PeekableLayerIterRef::create(iter).await?;
-        if let Some((k1, l1, _)) = iter.peek() {
-            let (k2, l2) = first_key_lower_bound;
-            debug_assert!((k1, l1) >= (k2, l2));
-        }
-        *self = Self::Loaded { iter };
-        Ok(())
-    }
-
-    fn is_loaded(&self) -> bool {
-        matches!(self, Self::Loaded { .. })
-    }
-
-    /// Correctness: must load the iterator before using.
-    ///
-    /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it.
-    /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
-    /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
-    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        let Self::Loaded { iter } = self else {
-            panic!("must load the iterator before using")
-        };
-        iter.next().await
-    }
-}
-
-pub struct MergeIterator<'a> {
-    heap: BinaryHeap<IteratorWrapper<'a>>,
-}
-
-impl<'a> MergeIterator<'a> {
-    pub fn create(
-        deltas: &[&'a DeltaLayerInner],
-        images: &[&'a ImageLayerInner],
-        ctx: &'a RequestContext,
-    ) -> Self {
-        let mut heap = Vec::with_capacity(images.len() + deltas.len());
-        for image in images {
-            heap.push(IteratorWrapper::create_from_image_layer(image, ctx));
-        }
-        for delta in deltas {
-            heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx));
-        }
-        Self {
-            heap: BinaryHeap::from(heap),
-        }
-    }
-
-    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        while let Some(mut iter) = self.heap.peek_mut() {
-            if !iter.is_loaded() {
-                // Once we load the iterator, we can know the real first key-value pair in the iterator.
-                // We put it back into the heap so that a potentially unloaded layer may have a key between
-                // [potential_first_key, loaded_first_key).
-                iter.load().await?;
-                continue;
-            }
-            let Some(item) = iter.next().await? else {
-                // If the iterator returns None, we pop this iterator. Actually, in the current implementation,
-                // we order None > Some, and all the rest of the iterators should return None.
-                binary_heap::PeekMut::pop(iter);
-                continue;
-            };
-            return Ok(Some(item));
-        }
-        Ok(None)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use itertools::Itertools;
-    use pageserver_api::key::Key;
-    use utils::lsn::Lsn;
-
-    use crate::{
-        tenant::{
-            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
-        },
-        DEFAULT_PG_VERSION,
-    };
-
-    async fn assert_merge_iter_equal(
-        merge_iter: &mut MergeIterator<'_>,
-        expect: &[(Key, Lsn, Value)],
-    ) {
-        let mut expect_iter = expect.iter();
-        loop {
-            let o1 = merge_iter.next().await.unwrap();
-            let o2 = expect_iter.next();
-            assert_eq!(o1.is_some(), o2.is_some());
-            if o1.is_none() && o2.is_none() {
-                break;
-            }
-            let (k1, l1, v1) = o1.unwrap();
-            let (k2, l2, v2) = o2.unwrap();
-            assert_eq!(&k1, k2);
-            assert_eq!(l1, *l2);
-            assert_eq!(&v1, v2);
-        }
-    }
-
-    #[tokio::test]
-    async fn merge_in_between() {
-        use crate::repository::Value;
-        use bytes::Bytes;
-
-        let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-        let test_deltas1 = vec![
-            (
-                get_key(0),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-            (
-                get_key(5),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-        ];
-        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
-            .await
-            .unwrap();
-        let test_deltas2 = vec![
-            (
-                get_key(3),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-            (
-                get_key(4),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-        ];
-        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
-            .await
-            .unwrap();
-        let mut merge_iter = MergeIterator::create(
-            &[
-                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
-            ],
-            &[],
-            &ctx,
-        );
-        let mut expect = Vec::new();
-        expect.extend(test_deltas1);
-        expect.extend(test_deltas2);
-        expect.sort_by(sort_delta);
-        assert_merge_iter_equal(&mut merge_iter, &expect).await;
-    }
-
-    #[tokio::test]
-    async fn delta_merge() {
-        use crate::repository::Value;
-        use bytes::Bytes;
-
-        let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-        const N: usize = 1000;
-        let test_deltas1 = (0..N)
-            .map(|idx| {
-                (
-                    get_key(idx as u32 / 10),
-                    Lsn(0x20 * ((idx as u64) % 10 + 1)),
-                    Value::Image(Bytes::from(format!("img{idx:05}"))),
-                )
-            })
-            .collect_vec();
-        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
-            .await
-            .unwrap();
-        let test_deltas2 = (0..N)
-            .map(|idx| {
-                (
-                    get_key(idx as u32 / 10),
-                    Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10),
-                    Value::Image(Bytes::from(format!("img{idx:05}"))),
-                )
-            })
-            .collect_vec();
-        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
-            .await
-            .unwrap();
-        let test_deltas3 = (0..N)
-            .map(|idx| {
-                (
-                    get_key(idx as u32 / 10 + N as u32),
-                    Lsn(0x10 * ((idx as u64) % 10 + 1)),
-                    Value::Image(Bytes::from(format!("img{idx:05}"))),
-                )
-            })
-            .collect_vec();
-        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
-            .await
-            .unwrap();
-        let mut merge_iter = MergeIterator::create(
-            &[
-                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
-            ],
-            &[],
-            &ctx,
-        );
-        let mut expect = Vec::new();
-        expect.extend(test_deltas1);
-        expect.extend(test_deltas2);
-        expect.extend(test_deltas3);
-        expect.sort_by(sort_delta);
-        assert_merge_iter_equal(&mut merge_iter, &expect).await;
-
-        // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
-    }
-
-    // TODO: image layer merge, delta+image mixed merge
-    // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
-}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,7 +14,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use arc_swap::ArcSwap;
 use bytes::Bytes;
 use camino::Utf8Path;
-use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
 use once_cell::sync::Lazy;
@@ -66,12 +65,13 @@ use std::{
    ops::{Deref, Range},
 };

+use crate::metrics::GetKind;
+use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
-        storage_layer::PersistentLayerDesc,
    },
 };
 use crate::{
@@ -90,15 +90,10 @@ use crate::{
 use crate::{
    disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
 };
-use crate::{
-    l0_flush::{self, L0FlushGlobalState},
-    metrics::GetKind,
-};
 use crate::{
    metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
-use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
 use crate::{
    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
    virtual_file::{MaybeFatalIo, VirtualFile},
@@ -213,7 +208,6 @@ pub struct TimelineResources {
    pub timeline_get_throttle: Arc<
        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
    >,
-    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

 pub(crate) struct AuxFilesState {
@@ -366,7 +360,6 @@ pub struct Timeline {
    repartition_threshold: u64,

    last_image_layer_creation_check_at: AtomicLsn,
-    last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,

    /// Current logical size of the "datadir", at the last LSN.
    current_logical_size: LogicalSize,
@@ -440,8 +433,6 @@ pub struct Timeline {
    /// in the future, add `extra_test_sparse_keyspace` if necessary.
    #[cfg(test)]
    pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
-
-    pub(crate) l0_flush_global_state: L0FlushGlobalState,
 }

 pub struct WalReceiverInfo {
@@ -466,9 +457,6 @@ pub(crate) struct GcInfo {

    /// Leases granted to particular LSNs.
    pub(crate) leases: BTreeMap<Lsn, LsnLease>,
-
-    /// Whether our branch point is within our ancestor's PITR interval (for cost estimation)
-    pub(crate) within_ancestor_pitr: bool,
 }

 impl GcInfo {
@@ -729,9 +717,6 @@ impl From<CreateImageLayersError> for CompactionError {
    fn from(e: CreateImageLayersError) -> Self {
        match e {
            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
-            CreateImageLayersError::Other(e) => {
-                CompactionError::Other(e.context("create image layers"))
-            }
            _ => CompactionError::Other(e.into()),
        }
    }
@@ -860,18 +845,6 @@ impl Timeline {
            .map(|ancestor| ancestor.timeline_id)
    }

-    /// Get the bytes written since the PITR cutoff on this branch, and
-    /// whether this branch's ancestor_lsn is within its parent's PITR.
-    pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
-        let gc_info = self.gc_info.read().unwrap();
-        let history = self
-            .get_last_record_lsn()
-            .checked_sub(gc_info.cutoffs.pitr)
-            .unwrap_or(Lsn(0))
-            .0;
-        (history, gc_info.within_ancestor_pitr)
-    }
-
    /// Lock and get timeline's GC cutoff
    pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
        self.latest_gc_cutoff_lsn.read()
@@ -1023,7 +996,6 @@ impl Timeline {
    }

    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
-    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;

    /// Look up multiple page versions at a given LSN
    ///
@@ -1256,7 +1228,7 @@ impl Timeline {
        let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
            .for_get_kind(get_kind)
            .start_timer();
-        self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
+        self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
            .await?;
        get_data_timer.stop_and_record();

@@ -1286,25 +1258,11 @@ impl Timeline {
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
        if !results.is_empty() {
-            let avg = layers_visited as f64 / results.len() as f64;
-            if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<RateLimit>> =
-                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
-                let mut rate_limit = LOGGED.lock().unwrap();
-                rate_limit.call(|| {
-                    tracing::info!(
-                      shard_id = %self.tenant_shard_id.shard_slug(),
-                      lsn = %lsn,
-                      "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned",
-                      keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size());
-                });
-            }
-
            // Note that this is an approximation. Tracking the exact number of layers visited
            // per key requires virtually unbounded memory usage and is inefficient
            // (i.e. segment tree tracking each range queried from a layer)
-            crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg);
+            crate::metrics::VEC_READ_NUM_LAYERS_VISITED
+                .observe(layers_visited as f64 / results.len() as f64);
        }

        Ok(results)
@@ -1596,13 +1554,7 @@ impl Timeline {
                    let existing_lease = occupied.get_mut();
                    if valid_until > existing_lease.valid_until {
                        existing_lease.valid_until = valid_until;
-                        let dt: DateTime<Utc> = valid_until.into();
-                        info!("lease extended to {}", dt);
-                    } else {
-                        let dt: DateTime<Utc> = existing_lease.valid_until.into();
-                        info!("existing lease covers greater length, valid until {}", dt);
                    }
-
                    existing_lease.clone()
                } else {
                    // Reject already GC-ed LSN (lsn < latest_gc_cutoff)
@@ -1611,8 +1563,6 @@ impl Timeline {
                        bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
                    }

-                    let dt: DateTime<Utc> = valid_until.into();
-                    info!("lease created, valid until {}", dt);
                    entry.or_insert(LsnLease { valid_until }).clone()
                }
            };
@@ -2389,7 +2339,6 @@ impl Timeline {
                )),
                repartition_threshold: 0,
                last_image_layer_creation_check_at: AtomicLsn::new(0),
-                last_image_layer_creation_check_instant: Mutex::new(None),

                last_received_wal: Mutex::new(None),
                rel_size_cache: RwLock::new(RelSizeCache {
@@ -2427,8 +2376,6 @@ impl Timeline {

                #[cfg(test)]
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
-
-                l0_flush_global_state: resources.l0_flush_global_state,
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -4470,58 +4417,6 @@ impl Timeline {
        }
    }

-    /// Predicate function which indicates whether we should check if new image layers
-    /// are required. Since checking if new image layers are required is expensive in
-    /// terms of CPU, we only do it in the following cases:
-    /// 1. If the timeline has ingested sufficient WAL to justify the cost
-    /// 2. If enough time has passed since the last check
-    /// 2.1. For large tenants, we wish to perform the check more often since they
-    /// suffer from the lack of image layers
-    /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval
-    fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
-        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
-
-        let last_checks_at = self.last_image_layer_creation_check_at.load();
-        let distance = lsn
-            .checked_sub(last_checks_at)
-            .expect("Attempt to compact with LSN going backwards");
-        let min_distance =
-            self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance();
-
-        let distance_based_decision = distance.0 >= min_distance;
-
-        let mut time_based_decision = false;
-        let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
-        if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
-            let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
-            {
-                self.get_checkpoint_timeout()
-            } else {
-                Duration::from_secs(3600 * 48)
-            };
-
-            time_based_decision = match *last_check_instant {
-                Some(last_check) => {
-                    let elapsed = last_check.elapsed();
-                    elapsed >= check_required_after
-                }
-                None => true,
-            };
-        }
-
-        // Do the expensive delta layer counting only if this timeline has ingested sufficient
-        // WAL since the last check or a checkpoint timeout interval has elapsed since the last
-        // check.
-        let decision = distance_based_decision || time_based_decision;
-
-        if decision {
-            self.last_image_layer_creation_check_at.store(lsn);
-            *last_check_instant = Some(Instant::now());
-        }
-
-        decision
-    }
-
    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
    async fn create_image_layers(
        self: &Arc<Timeline>,
@@ -4544,7 +4439,22 @@ impl Timeline {
        // image layers  <100000000..100000099> and <200000000..200000199> are not completely covering it.
        let mut start = Key::MIN;

-        let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
+        let check_for_image_layers = {
+            let last_checks_at = self.last_image_layer_creation_check_at.load();
+            let distance = lsn
+                .checked_sub(last_checks_at)
+                .expect("Attempt to compact with LSN going backwards");
+            let min_distance = self.get_image_layer_creation_check_threshold() as u64
+                * self.get_checkpoint_distance();
+
+            // Skip the expensive delta layer counting if this timeline has not ingested sufficient
+            // WAL since the last check.
+            distance.0 >= min_distance
+        };
+
+        if check_for_image_layers {
+            self.last_image_layer_creation_check_at.store(lsn);
+        }

        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;
@@ -4573,22 +4483,6 @@ impl Timeline {
                    start = img_range.end;
                    continue;
                }
-            } else if let ImageLayerCreationMode::Force = mode {
-                // When forced to create image layers, we might try and create them where they already
-                // exist.  This mode is only used in tests/debug.
-                let layers = self.layers.read().await;
-                if layers.contains_key(&PersistentLayerKey {
-                    key_range: img_range.clone(),
-                    lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
-                    is_delta: false,
-                }) {
-                    tracing::info!(
-                        "Skipping image layer at {lsn} {}..{}, already exists",
-                        img_range.start,
-                        img_range.end
-                    );
-                    continue;
-                }
            }

            let image_layer_writer = ImageLayerWriter::new(
@@ -4817,42 +4711,6 @@ impl DurationRecorder {
    }
 }

-/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the
-/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore,
-/// the layer descriptor requires the user to provide the ranges, which should cover all
-/// keys specified in the `data` field.
-#[cfg(test)]
-pub struct DeltaLayerTestDesc {
-    pub lsn_range: Range<Lsn>,
-    pub key_range: Range<Key>,
-    pub data: Vec<(Key, Lsn, Value)>,
-}
-
-#[cfg(test)]
-impl DeltaLayerTestDesc {
-    #[allow(dead_code)]
-    pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
-        Self {
-            lsn_range,
-            key_range,
-            data,
-        }
-    }
-
-    pub fn new_with_inferred_key_range(
-        lsn_range: Range<Lsn>,
-        data: Vec<(Key, Lsn, Value)>,
-    ) -> Self {
-        let key_min = data.iter().map(|(key, _, _)| key).min().unwrap();
-        let key_max = data.iter().map(|(key, _, _)| key).max().unwrap();
-        Self {
-            key_range: (*key_min)..(key_max.next()),
-            lsn_range,
-            data,
-        }
-    }
-}
-
 impl Timeline {
    async fn finish_compact_batch(
        self: &Arc<Self>,
@@ -5653,65 +5511,37 @@ impl Timeline {
    #[cfg(test)]
    pub(super) async fn force_create_delta_layer(
        self: &Arc<Timeline>,
-        mut deltas: DeltaLayerTestDesc,
+        mut deltas: Vec<(Key, Lsn, Value)>,
        check_start_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let last_record_lsn = self.get_last_record_lsn();
-        deltas
-            .data
-            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
-        assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start);
-        assert!(deltas.data.last().unwrap().0 < deltas.key_range.end);
-        for (_, lsn, _) in &deltas.data {
-            assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end);
-        }
+        deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+        let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
+        let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
+        let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
+        let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
        assert!(
-            deltas.lsn_range.end <= last_record_lsn,
-            "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
-            deltas.lsn_range.end,
-            last_record_lsn
+            max_lsn <= last_record_lsn,
+            "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
        );
+        let end_lsn = Lsn(max_lsn.0 + 1);
        if let Some(check_start_lsn) = check_start_lsn {
-            assert!(deltas.lsn_range.start >= check_start_lsn);
-        }
-        // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
-        // layers of the same start/end LSN, and so should the force inserted layer
-        {
-            /// Checks if a overlaps with b, assume a/b = [start, end).
-            pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-                !(a.end <= b.start || b.end <= a.start)
-            }
-
-            let guard = self.layers.read().await;
-            for layer in guard.layer_map().iter_historic_layers() {
-                if layer.is_delta()
-                    && overlaps_with(&layer.lsn_range, &deltas.lsn_range)
-                    && layer.lsn_range != deltas.lsn_range
-                {
-                    // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
-                    panic!(
-                        "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
-                        deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
-                    );
-                }
-            }
+            assert!(min_lsn >= check_start_lsn);
        }
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
-            deltas.key_range.start,
-            deltas.lsn_range,
+            min_key,
+            min_lsn..end_lsn,
            ctx,
        )
        .await?;
-        for (key, lsn, val) in deltas.data {
+        for (key, lsn, val) in deltas {
            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
        }
-        let delta_layer = delta_layer_writer
-            .finish(deltas.key_range.end, self, ctx)
-            .await?;
+        let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?;

        {
            let mut guard = self.layers.write().await;
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -272,7 +272,6 @@ impl DeleteTimelineFlow {
                TimelineResources {
                    remote_client,
                    timeline_get_throttle: tenant.timeline_get_throttle.clone(),
-                    l0_flush_global_state: tenant.l0_flush_global_state.clone(),
                },
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -339,10 +339,6 @@ impl LayerManager {
        self.layer_fmgr.contains(layer)
    }

-    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.layer_fmgr.contains_key(key)
-    }
-
    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
        self.layer_fmgr.0.keys().cloned().collect_vec()
    }
@@ -367,10 +363,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
            .clone()
    }

-    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.0.contains_key(key)
-    }
-
    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
-    metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -208,9 +208,14 @@ pub(super) async fn handle_walreceiver_connection(
        .instrument(tracing::info_span!("poller")),
    );

-    let _guard = LIVE_CONNECTIONS
-        .with_label_values(&["wal_receiver"])
-        .guard();
+    // Immediately increment the gauge, then create a job to decrement it on task exit.
+    // One of the pros of `defer!` is that this will *most probably*
+    // get called, even in presence of panics.
+    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
+    gauge.inc();
+    scopeguard::defer! {
+        gauge.dec();
+    }

    let identify = identify_system(&replication_client).await?;
    info!("{identify:?}");
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -20,7 +20,6 @@ use std::num::NonZeroUsize;

 use bytes::BytesMut;
 use pageserver_api::key::Key;
-use tokio_epoll_uring::BoundedBuf;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;

@@ -317,9 +316,8 @@ impl<'a> VectoredBlobReader<'a> {
        );
        let buf = self
            .file
-            .read_exact_at(buf.slice(0..read.size()), read.start, ctx)
-            .await?
-            .into_inner();
+            .read_exact_at_n(buf, read.start, read.size(), ctx)
+            .await?;

        let blobs_at = read.blobs_at.as_slice();
        let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -13,7 +13,7 @@
 use crate::context::RequestContext;
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};

-use crate::page_cache::{PageWriteGuard, PAGE_SZ};
+use crate::page_cache::PageWriteGuard;
 use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
@@ -48,7 +48,6 @@ pub(crate) mod owned_buffers_io {
    //! but for the time being we're proving out the primitives in the neon.git repo
    //! for faster iteration.

-    pub(crate) mod slice;
    pub(crate) mod write;
    pub(crate) mod util {
        pub(crate) mod size_tracking_writer;
@@ -144,17 +143,16 @@ struct SlotInner {
 /// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
 struct PageWriteGuardBuf {
    page: PageWriteGuard<'static>,
+    init_up_to: usize,
 }
 // Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
 // and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
-// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good.
-// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.)
 unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
    fn stable_ptr(&self) -> *const u8 {
        self.page.as_ptr()
    }
    fn bytes_init(&self) -> usize {
-        self.page.len()
+        self.init_up_to
    }
    fn bytes_total(&self) -> usize {
        self.page.len()
@@ -168,8 +166,8 @@ unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
    }

    unsafe fn set_init(&mut self, pos: usize) {
-        // There shouldn't really be any reason to call this API since bytes_init() == bytes_total().
        assert!(pos <= self.page.len());
+        self.init_up_to = pos;
    }
 }

@@ -587,37 +585,37 @@ impl VirtualFile {
        Ok(self.pos)
    }

-    /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`.
-    ///
-    /// The returned `Slice<Buf>` is equivalent to the input `slice`, i.e., it's the same view into the same buffer.
-    pub async fn read_exact_at<Buf>(
+    pub async fn read_exact_at<B>(
        &self,
-        slice: Slice<Buf>,
+        buf: B,
        offset: u64,
        ctx: &RequestContext,
-    ) -> Result<Slice<Buf>, Error>
+    ) -> Result<B, Error>
    where
-        Buf: IoBufMut + Send,
+        B: IoBufMut + Send,
    {
-        let assert_we_return_original_bounds = if cfg!(debug_assertions) {
-            Some((slice.stable_ptr() as usize, slice.bytes_total()))
-        } else {
-            None
-        };
+        let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| {
+            self.read_at(buf, offset, ctx)
+        })
+        .await;
+        res.map(|()| buf)
+    }

-        let original_bounds = slice.bounds();
-        let (buf, res) =
-            read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await;
-        let res = res.map(|_| buf.slice(original_bounds));
-
-        if let Some(original_bounds) = assert_we_return_original_bounds {
-            if let Ok(slice) = &res {
-                let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total());
-                assert_eq!(original_bounds, returned_bounds);
-            }
-        }
-
-        res
+    pub async fn read_exact_at_n<B>(
+        &self,
+        buf: B,
+        offset: u64,
+        count: usize,
+        ctx: &RequestContext,
+    ) -> Result<B, Error>
+    where
+        B: IoBufMut + Send,
+    {
+        let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
+            self.read_at(buf, offset, ctx)
+        })
+        .await;
+        res.map(|()| buf)
    }

    /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
@@ -627,11 +625,13 @@ impl VirtualFile {
        offset: u64,
        ctx: &RequestContext,
    ) -> Result<PageWriteGuard<'static>, Error> {
-        let buf = PageWriteGuardBuf { page }.slice_full();
-        debug_assert_eq!(buf.bytes_total(), PAGE_SZ);
-        self.read_exact_at(buf, offset, ctx)
-            .await
-            .map(|slice| slice.into_inner().page)
+        let buf = PageWriteGuardBuf {
+            page,
+            init_up_to: 0,
+        };
+        let res = self.read_exact_at(buf, offset, ctx).await;
+        res.map(|PageWriteGuardBuf { page, .. }| page)
+            .map_err(|e| Error::new(ErrorKind::Other, e))
    }

    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
@@ -722,14 +722,14 @@ impl VirtualFile {
        (buf, Ok(n))
    }

-    pub(crate) async fn read_at<Buf>(
+    pub(crate) async fn read_at<B>(
        &self,
-        buf: tokio_epoll_uring::Slice<Buf>,
+        buf: B,
        offset: u64,
        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
-    ) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
+    ) -> (B, Result<usize, Error>)
    where
-        Buf: tokio_epoll_uring::IoBufMut + Send,
+        B: tokio_epoll_uring::BoundedBufMut + Send,
    {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
@@ -781,16 +781,26 @@ impl VirtualFile {
 }

 // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
-pub async fn read_exact_at_impl<Buf, F, Fut>(
-    mut buf: tokio_epoll_uring::Slice<Buf>,
+pub async fn read_exact_at_impl<B, F, Fut>(
+    buf: B,
    mut offset: u64,
+    count: Option<usize>,
    mut read_at: F,
-) -> (Buf, std::io::Result<()>)
+) -> (B, std::io::Result<()>)
 where
-    Buf: IoBufMut + Send,
-    F: FnMut(tokio_epoll_uring::Slice<Buf>, u64) -> Fut,
-    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<Buf>, std::io::Result<usize>)>,
+    B: IoBufMut + Send,
+    F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
+    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
 {
+    let mut buf: tokio_epoll_uring::Slice<B> = match count {
+        Some(count) => {
+            assert!(count <= buf.bytes_total());
+            assert!(count > 0);
+            buf.slice(..count) // may include uninitialized memory
+        }
+        None => buf.slice_full(), // includes all the uninitialized memory
+    };
+
    while buf.bytes_total() != 0 {
        let res;
        (buf, res) = read_at(buf, offset).await;
@@ -872,7 +882,7 @@ mod test_read_exact_at_impl {

    #[tokio::test]
    async fn test_basic() {
-        let buf = Vec::with_capacity(5).slice_full();
+        let buf = Vec::with_capacity(5);
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::from(vec![Expectation {
                offset: 0,
@@ -880,7 +890,7 @@ mod test_read_exact_at_impl {
                result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
            }]),
        }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -889,13 +899,33 @@ mod test_read_exact_at_impl {
        assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
    }

+    #[tokio::test]
+    async fn test_with_count() {
+        let buf = Vec::with_capacity(5);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![Expectation {
+                offset: 0,
+                bytes_total: 3,
+                result: Ok(vec![b'a', b'b', b'c']),
+            }]),
+        }));
+
+        let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c']);
+    }
+
    #[tokio::test]
    async fn test_empty_buf_issues_no_syscall() {
-        let buf = Vec::new().slice_full();
+        let buf = Vec::new();
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::new(),
        }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -905,7 +935,7 @@ mod test_read_exact_at_impl {

    #[tokio::test]
    async fn test_two_read_at_calls_needed_until_buf_filled() {
-        let buf = Vec::with_capacity(4).slice_full();
+        let buf = Vec::with_capacity(4);
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::from(vec![
                Expectation {
@@ -920,7 +950,7 @@ mod test_read_exact_at_impl {
                },
            ]),
        }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -931,7 +961,7 @@ mod test_read_exact_at_impl {

    #[tokio::test]
    async fn test_eof_before_buffer_full() {
-        let buf = Vec::with_capacity(3).slice_full();
+        let buf = Vec::with_capacity(3);
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::from(vec![
                Expectation {
@@ -951,7 +981,7 @@ mod test_read_exact_at_impl {
                },
            ]),
        }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -1021,29 +1051,27 @@ impl VirtualFile {
        ctx: &RequestContext,
    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
        use crate::page_cache::PAGE_SZ;
-        let slice = Vec::with_capacity(PAGE_SZ).slice_full();
-        assert_eq!(slice.bytes_total(), PAGE_SZ);
-        let slice = self
-            .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx)
+        let buf = vec![0; PAGE_SZ];
+        let buf = self
+            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx)
            .await?;
-        Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner()))
+        Ok(crate::tenant::block_io::BlockLease::Vec(buf))
    }

    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
        let mut tmp = vec![0; 128];
        loop {
-            let slice = tmp.slice(..128);
-            let (slice, res) = self.read_at(slice, self.pos, ctx).await;
+            let res;
+            (tmp, res) = self.read_at(tmp, self.pos, ctx).await;
            match res {
                Ok(0) => return Ok(()),
                Ok(n) => {
                    self.pos += n as u64;
-                    buf.extend_from_slice(&slice[..n]);
+                    buf.extend_from_slice(&tmp[..n]);
                }
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
-            tmp = slice.into_inner();
        }
    }
 }
@@ -1157,7 +1185,6 @@ mod tests {
    use crate::task_mgr::TaskKind;

    use super::*;
-    use owned_buffers_io::slice::SliceExt;
    use rand::seq::SliceRandom;
    use rand::thread_rng;
    use rand::Rng;
@@ -1179,16 +1206,13 @@ mod tests {
    impl MaybeVirtualFile {
        async fn read_exact_at(
            &self,
-            mut slice: tokio_epoll_uring::Slice<Vec<u8>>,
+            mut buf: Vec<u8>,
            offset: u64,
            ctx: &RequestContext,
-        ) -> Result<tokio_epoll_uring::Slice<Vec<u8>>, Error> {
+        ) -> Result<Vec<u8>, Error> {
            match self {
-                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await,
-                MaybeVirtualFile::File(file) => {
-                    let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed();
-                    file.read_exact_at(rust_slice, offset).map(|()| slice)
-                }
+                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await,
+                MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
            }
        }
        async fn write_all_at<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
@@ -1262,12 +1286,9 @@ mod tests {
            len: usize,
            ctx: &RequestContext,
        ) -> Result<String, Error> {
-            let slice = Vec::with_capacity(len).slice_full();
-            assert_eq!(slice.bytes_total(), len);
-            let slice = self.read_exact_at(slice, pos, ctx).await?;
-            let vec = slice.into_inner();
-            assert_eq!(vec.len(), len);
-            Ok(String::from_utf8(vec).unwrap())
+            let buf = vec![0; len];
+            let buf = self.read_exact_at(buf, pos, ctx).await?;
+            Ok(String::from_utf8(buf).unwrap())
        }
    }

@@ -1486,11 +1507,7 @@ mod tests {
                let mut rng = rand::rngs::OsRng;
                for _ in 1..1000 {
                    let f = &files[rng.gen_range(0..files.len())];
-                    buf = f
-                        .read_exact_at(buf.slice_full(), 0, &ctx)
-                        .await
-                        .unwrap()
-                        .into_inner();
+                    buf = f.read_exact_at(buf, 0, &ctx).await.unwrap();
                    assert!(buf == SAMPLE);
                }
            });
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -107,7 +107,7 @@ use std::{
    sync::atomic::{AtomicU8, Ordering},
 };

-use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata};
+use super::{FileGuard, Metadata};

 #[cfg(target_os = "linux")]
 fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
@@ -120,29 +120,38 @@ fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error<std::io::Error>) -> std:
 }

 impl IoEngine {
-    pub(super) async fn read_at<Buf>(
+    pub(super) async fn read_at<B>(
        &self,
        file_guard: FileGuard,
        offset: u64,
-        mut slice: tokio_epoll_uring::Slice<Buf>,
-    ) -> (
-        (FileGuard, tokio_epoll_uring::Slice<Buf>),
-        std::io::Result<usize>,
-    )
+        mut buf: B,
+    ) -> ((FileGuard, B), std::io::Result<usize>)
    where
-        Buf: tokio_epoll_uring::IoBufMut + Send,
+        B: tokio_epoll_uring::BoundedBufMut + Send,
    {
        match self {
            IoEngine::NotSet => panic!("not initialized"),
            IoEngine::StdFs => {
-                let rust_slice = slice.as_mut_rust_slice_full_zeroed();
-                let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset));
-                ((file_guard, slice), res)
+                // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
+                let dst = unsafe {
+                    std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
+                };
+                let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
+                if let Ok(nbytes) = &res {
+                    assert!(*nbytes <= buf.bytes_total());
+                    // SAFETY: see above assertion
+                    unsafe {
+                        buf.set_init(*nbytes);
+                    }
+                }
+                #[allow(dropping_references)]
+                drop(dst);
+                ((file_guard, buf), res)
            }
            #[cfg(target_os = "linux")]
            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring_ext::thread_local_system().await;
-                let (resources, res) = system.read(file_guard, offset, slice).await;
+                let (resources, res) = system.read(file_guard, offset, buf).await;
                (resources, res.map_err(epoll_uring_error_to_std))
            }
        }
--- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs
@@ -1,121 +0,0 @@
-use tokio_epoll_uring::BoundedBuf;
-use tokio_epoll_uring::BoundedBufMut;
-use tokio_epoll_uring::IoBufMut;
-use tokio_epoll_uring::Slice;
-
-pub(crate) trait SliceExt {
-    /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO.
-    ///
-    /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]`
-    fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8];
-}
-
-impl<B> SliceExt for Slice<B>
-where
-    B: IoBufMut,
-{
-    #[inline(always)]
-    fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] {
-        // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice
-        //
-        // SAFETY: we own `slice`, don't write outside the bounds
-        unsafe {
-            let to_init = self.bytes_total() - self.bytes_init();
-            self.stable_mut_ptr()
-                .add(self.bytes_init())
-                .write_bytes(0, to_init);
-            self.set_init(self.bytes_total());
-        };
-        let bytes_total = self.bytes_total();
-        &mut self[0..bytes_total]
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::io::Read;
-
-    use super::*;
-    use bytes::Buf;
-    use tokio_epoll_uring::Slice;
-
-    #[test]
-    fn test_slice_full_zeroed() {
-        let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader();
-
-        // before we start the test, let's make sure we have a shared understanding of what slice_full does
-        {
-            let buf = Vec::with_capacity(3);
-            let slice: Slice<_> = buf.slice_full();
-            assert_eq!(slice.bytes_init(), 0);
-            assert_eq!(slice.bytes_total(), 3);
-            let rust_slice = &slice[..];
-            assert_eq!(
-                rust_slice.len(),
-                0,
-                "Slice only derefs to a &[u8] of the initialized part"
-            );
-        }
-
-        // and also let's establish a shared understanding of .slice()
-        {
-            let buf = Vec::with_capacity(3);
-            let slice: Slice<_> = buf.slice(0..2);
-            assert_eq!(slice.bytes_init(), 0);
-            assert_eq!(slice.bytes_total(), 2);
-            let rust_slice = &slice[..];
-            assert_eq!(
-                rust_slice.len(),
-                0,
-                "Slice only derefs to a &[u8] of the initialized part"
-            );
-        }
-
-        // the above leads to the easy mistake of using slice[..] for borrow-based IO like so:
-        {
-            let buf = Vec::with_capacity(3);
-            let mut slice: Slice<_> = buf.slice_full();
-            assert_eq!(slice[..].len(), 0);
-            let mut file = make_fake_file();
-            file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0
-            assert_eq!(&slice[..] as &[u8], &[][..] as &[u8]);
-        }
-
-        // With owned buffers IO like with VirtualFilem, you could totally
-        // pass in a `Slice` with bytes_init()=0 but bytes_total()=5
-        // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5.
-        {
-            // TODO: demo
-        }
-
-        //
-        // Ok, now that we have a shared understanding let's demo how to use the extension trait.
-        //
-
-        // slice_full()
-        {
-            let buf = Vec::with_capacity(3);
-            let mut slice: Slice<_> = buf.slice_full();
-            let rust_slice = slice.as_mut_rust_slice_full_zeroed();
-            assert_eq!(rust_slice.len(), 3);
-            assert_eq!(rust_slice, &[0, 0, 0]);
-            let mut file = make_fake_file();
-            file.read_exact(rust_slice).unwrap();
-            assert_eq!(rust_slice, b"123");
-            assert_eq!(&slice[..], b"123");
-        }
-
-        // .slice(..)
-        {
-            let buf = Vec::with_capacity(3);
-            let mut slice: Slice<_> = buf.slice(0..2);
-            let rust_slice = slice.as_mut_rust_slice_full_zeroed();
-            assert_eq!(rust_slice.len(), 2);
-            assert_eq!(rust_slice, &[0, 0]);
-            let mut file = make_fake_file();
-            file.read_exact(rust_slice).unwrap();
-            assert_eq!(rust_slice, b"12");
-            assert_eq!(&slice[..], b"12");
-        }
-    }
-}
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -343,33 +343,7 @@ impl WalIngest {
                        xlog_checkpoint.oldestActiveXid,
                        self.checkpoint.oldestActiveXid
                    );
-
-                    // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
-                    // because at shutdown, all in-progress transactions will implicitly
-                    // end. Postgres startup code knows that, and allows hot standby to start
-                    // immediately from a shutdown checkpoint.
-                    //
-                    // In Neon, Postgres hot standby startup always behaves as if starting from
-                    // an online checkpoint. It needs a valid `oldestActiveXid` value, so
-                    // instead of overwriting self.checkpoint.oldestActiveXid with
-                    // InvalidTransactionid from the checkpoint WAL record, update it to a
-                    // proper value, knowing that there are no in-progress transactions at this
-                    // point, except for prepared transactions.
-                    //
-                    // See also the neon code changes in the InitWalRecovery() function.
-                    if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
-                        && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                    {
-                        let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
-                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                            if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
-                                oldest_active_xid = xid;
-                            }
-                        }
-                        self.checkpoint.oldestActiveXid = oldest_active_xid;
-                    } else {
-                        self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
-                    }
+                    self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;

                    // Write a new checkpoint key-value pair on every checkpoint record, even
                    // if nothing really changed. Not strictly required, but it seems nice to
@@ -401,7 +375,6 @@ impl WalIngest {
                if info == pg_constants::XLOG_RUNNING_XACTS {
                    let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
-                    self.checkpoint_modified = true;
                }
            }
            pg_constants::RM_REPLORIGIN_ID => {
@@ -1304,10 +1277,13 @@ impl WalIngest {
            xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db
        );

-        // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is
-        // truncated, but a checkpoint record with the updated values isn't written until
-        // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
-        // so we keep the oldestXid and oldestXidDB up-to-date.
+        // Here we treat oldestXid and oldestXidDB
+        // differently from the postgres redo routines.
+        // In postgres, checkpoint.oldestXid lags behind xlrec.oldest_xid
+        // until a checkpoint happens and updates the value.
+        // Here we can use the most recent value.
+        // It's just an optimization, though, and can be deleted.
+        // TODO: Figure out if there will be any issues with replicas.
        self.checkpoint.oldestXid = xlrec.oldest_xid;
        self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
        self.checkpoint_modified = true;
@@ -1408,31 +1384,14 @@ impl WalIngest {
            // Note: The multixact members can wrap around, even within one WAL record.
            offset = offset.wrapping_add(n_this_page as u32);
        }
-        let next_offset = offset;
-        assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset);
-
-        // Update next-multi-xid and next-offset
-        //
-        // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to
-        // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that
-        // read it, like GetNewMultiXactId(). This is different from how nextXid is
-        // incremented! nextXid skips over < FirstNormalTransactionId when the the value
-        // is stored, so it's never 0 in a checkpoint.
-        //
-        // I don't know why it's done that way, it seems less error-prone to skip over 0
-        // when the value is stored rather than when it's read. But let's do it the same
-        // way here.
-        let next_multi_xid = xlrec.mid.wrapping_add(1);
-
-        if self
-            .checkpoint
-            .update_next_multixid(next_multi_xid, next_offset)
-        {
+        if xlrec.mid >= self.checkpoint.nextMulti {
+            self.checkpoint.nextMulti = xlrec.mid + 1;
+            self.checkpoint_modified = true;
+        }
+        if xlrec.moff + xlrec.nmembers > self.checkpoint.nextMultiOffset {
+            self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
            self.checkpoint_modified = true;
        }
-
-        // Also update the next-xid with the highest member. According to the comments in
-        // multixact_redo(), this shouldn't be necessary, but let's do the same here.
        let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
            if let Some(max_xid) = acc {
                if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -6,7 +6,6 @@ OBJS = \
 	$(WIN32RES) \
 	extension_server.o \
 	file_cache.o \
-	hll.o \
 	libpagestore.o \
 	neon.o \
 	neon_utils.o \
@@ -23,7 +22,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl

 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql  neon--1.3--1.4.sql neon--1.4--1.3.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"

 EXTRA_CLEAN = \
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -26,6 +26,7 @@
 #include "miscadmin.h"
 #include "pagestore_client.h"
 #include "common/hashfn.h"
+#include "lib/hyperloglog.h"
 #include "pgstat.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
@@ -39,8 +40,6 @@
 #include "utils/dynahash.h"
 #include "utils/guc.h"

-#include "hll.h"
-
 /*
 * Local file cache is used to temporary store relations pages in local file system.
 * All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -63,6 +62,7 @@
 #define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
 #define MB					((uint64)1024*1024)

+#define HYPER_LOG_LOG_BIT_WIDTH   10
 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))

 typedef struct FileCacheEntry
@@ -87,7 +87,8 @@ typedef struct FileCacheControl
 	uint64		writes;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
-	HyperLogLogState wss_estimation; /* estimation of working set size */
+	hyperLogLogState wss_estimation; /* estimation of working set size */
+	uint8_t		hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; /* storage that wss_estimation.hashesArr is pointed at */
 } FileCacheControl;

 static HTAB *lfc_hash;
@@ -237,7 +238,12 @@ lfc_shmem_startup(void)
 		dlist_init(&lfc_ctl->lru);

 		/* Initialize hyper-log-log structure for estimating working set size */
-		initSHLL(&lfc_ctl->wss_estimation);
+		initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
+
+		/* We need hashes in shared memory */
+		pfree(lfc_ctl->wss_estimation.hashesArr);
+		memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
+		lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;

 		/* Recreate file cache on restart */
 		fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
@@ -539,7 +545,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	/* Approximate working set */
 	tag.blockNum = blkno;
-	addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));

 	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
 	{
@@ -980,38 +986,20 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		SRF_RETURN_DONE(funcctx);
 }

-PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
-
-Datum
-approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
-{
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
-		LWLockAcquire(lfc_lock, LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
-}
-
 PG_FUNCTION_INFO_V1(approximate_working_set_size);

 Datum
 approximate_working_set_size(PG_FUNCTION_ARGS)
 {
+	int32 dc = -1;
 	if (lfc_size_limit != 0)
 	{
-		int32 dc;
 		bool reset = PG_GETARG_BOOL(0);
 		LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
+		dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
 		if (reset)
-			memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
+			memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
 		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
 	}
-	PG_RETURN_NULL();
+	PG_RETURN_INT32(dc);
 }
--- a/pgxn/neon/hll.c
+++ b/pgxn/neon/hll.c
@@ -1,193 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * hll.c
- *	  Sliding HyperLogLog cardinality estimator
- *
- * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
- *
- * Implements https://hal.science/hal-00465313/document
- * 
- * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
- * suited to estimating the cardinality of very large sets;  in particular, we
- * have not attempted to further optimize the implementation as described in
- * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
- * Engineering of a State of The Art Cardinality Estimation Algorithm".
- *
- * A sparse representation of HyperLogLog state is used, with fixed space
- * overhead.
- *
- * The copyright terms of Ohno's original version (the MIT license) follow.
- *
- * IDENTIFICATION
- *	  src/backend/lib/hyperloglog.c
- *
- *-------------------------------------------------------------------------
- */
-
-/*
- * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the 'Software'), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <math.h>
-
-#include "postgres.h"
-#include "funcapi.h"
-#include "port/pg_bitutils.h"
-#include "utils/timestamp.h"
-#include "hll.h"
-
-
-#define POW_2_32			(4294967296.0)
-#define NEG_POW_2_32		(-4294967296.0)
-
-#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)
-
-/*
- * Worker for addHyperLogLog().
- *
- * Calculates the position of the first set bit in first b bits of x argument
- * starting from the first, reading from most significant to least significant
- * bits.
- *
- * Example (when considering fist 10 bits of x):
- *
- * rho(x = 0b1000000000)   returns 1
- * rho(x = 0b0010000000)   returns 3
- * rho(x = 0b0000000000)   returns b + 1
- *
- * "The binary address determined by the first b bits of x"
- *
- * Return value "j" used to index bit pattern to watch.
- */
-static inline uint8
-rho(uint32 x, uint8 b)
-{
-	uint8		j = 1;
-
-	if (x == 0)
-		return b + 1;
-
-	j = 32 - pg_leftmost_one_pos32(x);
-
-	if (j > b)
-		return b + 1;
-
-	return j;
-}
-
-/*
- * Initialize HyperLogLog track state
- */
-void
-initSHLL(HyperLogLogState *cState)
-{
-	memset(cState->regs, 0, sizeof(cState->regs));
-}
-
-/*
- * Adds element to the estimator, from caller-supplied hash.
- *
- * It is critical that the hash value passed be an actual hash value, typically
- * generated using hash_any().  The algorithm relies on a specific bit-pattern
- * observable in conjunction with stochastic averaging.  There must be a
- * uniform distribution of bits in hash values for each distinct original value
- * observed.
- */
-void
-addSHLL(HyperLogLogState *cState, uint32 hash)
-{
-	uint8		count;
-	uint32		index;
-	size_t		i;
-	size_t		j;
-
-	TimestampTz	now = GetCurrentTimestamp();
-	/* Use the first "k" (registerWidth) bits as a zero based index */
-	index = hash >> HLL_C_BITS;
-
-	/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
-	count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
-
-	cState->regs[index][count] = now;
-}
-
-static uint8
-getMaximum(const TimestampTz* reg, TimestampTz since)
-{
-	uint8 max = 0;
-
-	for (size_t i = 0; i < HLL_C_BITS + 1; i++)
-	{
-		if (reg[i] >= since)
-		{
-			max = i;
-		}
-	}
-
-	return max;
-}
-
-
-/*
- * Estimates cardinality, based on elements added so far
- */
-double
-estimateSHLL(HyperLogLogState *cState, time_t duration)
-{
-	double		result;
-	double		sum = 0.0;
-	size_t		i;
-	uint8       R[HLL_N_REGISTERS];
-	/* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */
-	TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC;
-
-	for (i = 0; i < HLL_N_REGISTERS; i++)
-	{
-		R[i] = getMaximum(cState->regs[i], since);
-		sum += 1.0 / pow(2.0, R[i]);
-	}
-
-	/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
-	result = ALPHA_MM / sum;
-
-	if (result <= (5.0 / 2.0) * HLL_N_REGISTERS)
-	{
-		/* Small range correction */
-		int			zero_count = 0;
-
-		for (i = 0; i < HLL_N_REGISTERS; i++)
-		{
-			zero_count += R[i] == 0;
-		}
-
-		if (zero_count != 0)
-			result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS /
-										   zero_count);
-	}
-	else if (result > (1.0 / 30.0) * POW_2_32)
-	{
-		/* Large range correction */
-		result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
-	}
-
-	return result;
-}
-
--- a/pgxn/neon/hll.h
+++ b/pgxn/neon/hll.h
@@ -1,86 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * hll.h
- *	  Sliding HyperLogLog cardinality estimator
- *
- * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
- *
- * Implements https://hal.science/hal-00465313/document
- * 
- * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
- * suited to estimating the cardinality of very large sets;  in particular, we
- * have not attempted to further optimize the implementation as described in
- * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
- * Engineering of a State of The Art Cardinality Estimation Algorithm".
- *
- * A sparse representation of HyperLogLog state is used, with fixed space
- * overhead.
- *
- * The copyright terms of Ohno's original version (the MIT license) follow.
- *
- * IDENTIFICATION
- *	  src/backend/lib/hyperloglog.c
- *
- *-------------------------------------------------------------------------
- */
-
-/*
- * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the 'Software'), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef HLL_H
-#define HLL_H
-
-#define HLL_BIT_WIDTH   10
-#define HLL_C_BITS      (32 - HLL_BIT_WIDTH)
-#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH)
-
-/*
- * HyperLogLog is an approximate technique for computing the number of distinct
- * entries in a set.  Importantly, it does this by using a fixed amount of
- * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
- * cardinality estimation algorithm" for more.
- *
- * Instead of a single counter for every bits register, we have a timestamp
- * for every valid number of bits we can encounter. Every time we encounter
- * a certain number of bits, we update the timestamp in those registers to
- * the current timestamp.
- *
- * We can query the sketch's stored cardinality for the range of some timestamp
- * up to now: For each register, we return the highest bits bucket that has a
- * modified timestamp >= the query timestamp. This value is the number of bits
- * for this register in the normal HLL calculation.
- *
- * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB.
- * Usage could be halved if we decide to reduce the required time dimension
- * precision; as 32 bits in second precision should be enough for statistics.
- * However, that is not yet implemented.
- */
-typedef struct HyperLogLogState
-{
-	TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
-} HyperLogLogState;
-
-extern void   initSHLL(HyperLogLogState *cState);
-extern void   addSHLL(HyperLogLogState *cState, uint32 hash);
-extern double estimateSHLL(HyperLogLogState *cState, time_t dutration);
-
-#endif
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -427,17 +427,12 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		values[n_pgsql_params] = NULL;

 		shard->conn = PQconnectStartParams(keywords, values, 1);
-		if (PQstatus(shard->conn) == CONNECTION_BAD)
+		if (!shard->conn)
 		{
-			char	   *msg = pchomp(PQerrorMessage(shard->conn));
-			CLEANUP_AND_DISCONNECT(shard);
-			ereport(elevel,
-					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-						errdetail_internal("%s", msg)));
-			pfree(msg);
+			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
 			return false;
 		}
+
 		shard->state = PS_Connecting_Startup;
 		/* fallthrough */
 	}
--- a/pgxn/neon/neon--1.3--1.4.sql
+++ b/pgxn/neon/neon--1.3--1.4.sql
@@ -1,9 +0,0 @@
-\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. \quit
-
-CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null)
-RETURNS integer
-AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds'
-LANGUAGE C PARALLEL SAFE;
-
-GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor;
-
--- a/pgxn/neon/neon--1.4--1.3.sql
+++ b/pgxn/neon/neon--1.4--1.3.sql
@@ -1 +0,0 @@
-DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE;
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -12,8 +12,6 @@
 #include "fmgr.h"

 #include "miscadmin.h"
-#include "access/subtrans.h"
-#include "access/twophase.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "storage/buf_internals.h"
@@ -24,12 +22,10 @@
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
-#include "storage/proc.h"
 #include "storage/procsignal.h"
 #include "tcop/tcopprot.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
-#include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
 #include "utils/wait_event.h"
@@ -270,293 +266,6 @@ LogicalSlotsMonitorMain(Datum main_arg)
 	}
 }

-/*
- * XXX: These private to procarray.c, but we need them here.
- */
-#define PROCARRAY_MAXPROCS	(MaxBackends + max_prepared_xacts)
-#define TOTAL_MAX_CACHED_SUBXIDS \
-	((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
-
-/*
- * Restore running-xact information by scanning the CLOG at startup.
- *
- * In PostgreSQL, a standby always has to wait for a running-xacts WAL record
- * to arrive before it can start accepting queries. Furthermore, if there are
- * transactions with too many subxids (> 64) open to fit in the in-memory
- * subxids cache, the running-xacts record will be marked as "suboverflowed",
- * and the standby will need to also wait for the currently in-progress
- * transactions to finish.
- *
- * That's not great in PostgreSQL, because a hot standby does not necessary
- * open up for queries immediately as you might expect. But it's worse in
- * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint
- * record; it can start at any LSN. Postgres arranges things so that there is
- * a running-xacts record soon after every checkpoint record, but when you
- * start from an arbitrary LSN, that doesn't help. If the primary is idle, or
- * not running at all, it might never write a new running-xacts record,
- * leaving the replica in a limbo where it can never start accepting queries.
- *
- * To mitigate that, we have an additional mechanism to find the running-xacts
- * information: we scan the CLOG, making note of any XIDs not marked as
- * committed or aborted. They are added to the Postgres known-assigned XIDs
- * array by calling ProcArrayApplyRecoveryInfo() in the caller of this
- * function.
- *
- * There is one big limitation with that mechanism: The size of the
- * known-assigned XIDs is limited, so if there are a lot of in-progress XIDs,
- * we have to give up. Furthermore, we don't know how many of the in-progress
- * XIDs are subtransactions, and if we use up all the space in the
- * known-assigned XIDs array for subtransactions, we might run out of space in
- * the array later during WAL replay, causing the replica to shut down with
- * "ERROR: too many KnownAssignedXids". The safe # of XIDs that we can add to
- * the known-assigned array without risking that error later is very low,
- * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up
- * to half of the known-assigned XIDs array for the subtransactions, even
- * though that risks getting the error later.
- *
- * Note: It's OK if the recovered list of XIDs includes some transactions that
- * have crashed in the primary, and hence will never commit. They will be seen
- * as in-progress, until we see a new next running-acts record with an
- * oldestActiveXid that invalidates them. That's how the known-assigned XIDs
- * array always works.
- *
- * If scraping the CLOG doesn't succeed for some reason, like the subxid
- * overflow, Postgres will fall back to waiting for a running-xacts record
- * like usual.
- *
- * Returns true if a complete list of in-progress XIDs was scraped.
- */
-static bool
-RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids)
-{
-	TransactionId from;
-	TransactionId till;
-	int			max_xcnt;
-	TransactionId *prepared_xids = NULL;
-	int			n_prepared_xids;
-	TransactionId *restored_xids = NULL;
-	int			n_restored_xids;
-	int			next_prepared_idx;
-
-	Assert(*xids == NULL);
-
-	/*
-	 * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We
-	 * don't know where to start the scan.
-	 *
-	 * This shouldn't happen, because the pageserver always maintains a valid
-	 * oldestActiveXid nowadays. Except when starting at an old point in time
-	 * that was ingested before the pageserver was taught to do that.
-	 */
-	if (!TransactionIdIsValid(checkpoint->oldestActiveXid))
-	{
-		elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set");
-		goto fail;
-	}
-
-	/*
-	 * We will scan the CLOG starting from the oldest active XID.
-	 *
-	 * In some corner cases, the oldestActiveXid from the last checkpoint
-	 * might already have been truncated from the CLOG. That is,
-	 * oldestActiveXid might be older than oldestXid. That's possible because
-	 * oldestActiveXid is only updated at checkpoints. After the last
-	 * checkpoint, the oldest transaction might have committed, and the CLOG
-	 * might also have been already truncated. So if oldestActiveXid is older
-	 * than oldestXid, start at oldestXid instead. (Otherwise we'd try to
-	 * access CLOG segments that have already been truncated away.)
-	 */
-	from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid)
-		? checkpoint->oldestActiveXid : checkpoint->oldestXid;
-	till = XidFromFullTransactionId(checkpoint->nextXid);
-
-	/*
-	 * To avoid "too many KnownAssignedXids" error later during replay, we
-	 * limit number of collected transactions. This is a tradeoff: if we are
-	 * willing to consume more of the KnownAssignedXids space for the XIDs
-	 * now, that allows us to start up, but we might run out of space later.
-	 *
-	 * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS,
-	 * which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS). In
-	 * PostgreSQL, that's always enough because the primary will always write
-	 * an XLOG_XACT_ASSIGNMENT record if a transaction has more than
-	 * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows
-	 * the standby to mark the XIDs in pg_subtrans and removing them from the
-	 * KnowingAssignedXids array.
-	 *
-	 * Here, we don't know which XIDs belong to subtransactions that have
-	 * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. If we
-	 * wanted to be totally safe and avoid the possibility of getting a "too
-	 * many KnownAssignedXids" error later, we would have to limit ourselves
-	 * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top
-	 * transaction IDs too, because we cannot distinguish between top
-	 * transaction IDs and subtransactions here.
-	 *
-	 * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That
-	 * strikes a sensible balance between being useful, and risking a "too
-	 * many KnownAssignedXids" error later.
-	 */
-	max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2;
-
-	/*
-	 * Collect XIDs of prepared transactions in an array. This includes only
-	 * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions
-	 * has already been called, so we can find all the sub-transactions in
-	 * pg_subtrans.
-	 */
-	PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids);
-	qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator);
-
-	/*
-	 * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'.
-	 */
-	elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till);
-	restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId));
-	n_restored_xids = 0;
-	next_prepared_idx = 0;
-	for (TransactionId xid = from; xid != till;)
-	{
-		XLogRecPtr	xidlsn;
-		XidStatus	xidstatus;
-
-		xidstatus = TransactionIdGetStatus(xid, &xidlsn);
-
-		/*
-		 * "Merge" the prepared transactions into the restored_xids array as
-		 * we go.  The prepared transactions array is sorted. This is mostly
-		 * a sanity check to ensure that all the prpeared transactions are
-		 * seen as in-progress. (There is a check after the loop that we didn't
-		 * miss any.)
-		 */
-		if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx])
-		{
-			/*
-			 * This is a top-level transaction ID of a prepared transaction.
-			 * Include it in the array.
-			 */
-
-			/* sanity check */
-			if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS)
-			{
-				elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG",
-					 xid, xidstatus);
-				Assert(false);
-				goto fail;
-			}
-
-			elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids);
-			next_prepared_idx++;
-		}
-		else if (xidstatus == TRANSACTION_STATUS_COMMITTED)
-		{
-			elog(DEBUG1, "XID %u: was committed", xid);
-			goto skip;
-		}
-		else if (xidstatus == TRANSACTION_STATUS_ABORTED)
-		{
-			elog(DEBUG1, "XID %u: was aborted", xid);
-			goto skip;
-		}
-		else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS)
-		{
-			/*
-			 * In-progress transactions are included in the array.
-			 *
-			 * Except subtransactions of the prepared transactions. They are
-			 * already set in pg_subtrans, and hence don't need to be tracked
-			 * in the known-assigned XIDs array.
-			 */
-			if (n_prepared_xids > 0)
-			{
-				TransactionId parent = SubTransGetParent(xid);
-
-				if (TransactionIdIsValid(parent))
-				{
-					/*
-					 * This is a subtransaction belonging to a prepared
-					 * transaction.
-					 *
-					 * Sanity check that it is in the prepared XIDs array. It
-					 * should be, because StandbyRecoverPreparedTransactions
-					 * populated pg_subtrans, and no other XID should be set
-					 * in it yet. (This also relies on the fact that
-					 * StandbyRecoverPreparedTransactions sets the parent of
-					 * each subxid to point directly to the top-level XID,
-					 * rather than restoring the original subtransaction
-					 * hierarchy.)
-					 */
-					if (bsearch(&parent, prepared_xids, next_prepared_idx,
-								sizeof(TransactionId), xidLogicalComparator) == NULL)
-					{
-						elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG",
-							 xid, parent);
-						Assert(false);
-						goto fail;
-					}
-					elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent);
-					goto skip;
-				}
-			}
-
-			/* include it in the array */
-			elog(DEBUG1, "XID %u: is in progress", xid);
-		}
-		else
-		{
-			/*
-			 * SUB_COMMITTED is a transient state used at commit. We don't
-			 * expect to see that here.
-			 */
-			elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG",
-				 xid, xidstatus);
-			Assert(false);
-			goto fail;
-		}
-
-		if (n_restored_xids >= max_xcnt)
-		{
-			/*
-			 * Overflowed. We won't be able to install the RunningTransactions
-			 * snapshot.
-			 */
-			elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
-				 checkpoint->oldestXid, checkpoint->oldestActiveXid,
-				 XidFromFullTransactionId(checkpoint->nextXid));
-			goto fail;
-		}
-
-		restored_xids[n_restored_xids++] = xid;
-
-	skip:
-		TransactionIdAdvance(xid);
-		continue;
-	}
-
-	/* sanity check */
-	if (next_prepared_idx != n_prepared_xids)
-	{
-		elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG",
-			 prepared_xids[next_prepared_idx]);
-		Assert(false);
-		goto fail;
-	}
-
-	elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
-		 n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid));
-	*nxids = n_restored_xids;
-	*xids = restored_xids;
-	return true;
-
- fail:
-	*nxids = 0;
-	*xids = NULL;
-	if (restored_xids)
-		pfree(restored_xids);
-	if (prepared_xids)
-		pfree(prepared_xids);
-	return false;
-}
-
 void
 _PG_init(void)
 {
@@ -579,8 +288,6 @@ _PG_init(void)

 	pg_init_extension_server();

-	restore_running_xacts_callback = RestoreRunningXactsFromClog;
-
 	/*
 	 * Important: This must happen after other parts of the extension are
 	 * loaded, otherwise any settings to GUCs that were set before the
--- a/pgxn/neon_test_utils/Makefile
+++ b/pgxn/neon_test_utils/Makefile
@@ -7,7 +7,7 @@ OBJS = \
 	neontest.o

 EXTENSION = neon_test_utils
-DATA = neon_test_utils--1.3.sql
+DATA = neon_test_utils--1.1.sql
 PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"

 PG_CONFIG = pg_config
--- a/pgxn/neon_test_utils/neon_test_utils--1.1.sql
+++ b/pgxn/neon_test_utils/neon_test_utils--1.1.sql
@@ -41,25 +41,7 @@ RETURNS bytea
 AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
 LANGUAGE C PARALLEL UNSAFE;

-CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL)
+CREATE FUNCTION neon_xlogflush(lsn pg_lsn)
 RETURNS VOID
 AS 'MODULE_PATHNAME', 'neon_xlogflush'
 LANGUAGE C PARALLEL UNSAFE;
-
-CREATE FUNCTION trigger_panic()
-RETURNS VOID
-AS 'MODULE_PATHNAME', 'trigger_panic'
-LANGUAGE C PARALLEL UNSAFE;
-
-CREATE FUNCTION trigger_segfault()
-RETURNS VOID
-AS 'MODULE_PATHNAME', 'trigger_segfault'
-LANGUAGE C PARALLEL UNSAFE;
-
-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun
-CREATE OR REPLACE FUNCTION 💣() RETURNS void
-LANGUAGE plpgsql AS $$
-BEGIN
-    PERFORM trigger_segfault();
-END;
-$$;
--- a/pgxn/neon_test_utils/neon_test_utils.control
+++ b/pgxn/neon_test_utils/neon_test_utils.control
@@ -1,6 +1,6 @@
 # neon_test_utils extension
 comment = 'helpers for neon testing and debugging'
-default_version = '1.3'
+default_version = '1.1'
 module_pathname = '$libdir/neon_test_utils'
 relocatable = true
 trusted = true
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -15,7 +15,6 @@
 #include "access/relation.h"
 #include "access/xact.h"
 #include "access/xlog.h"
-#include "access/xlog_internal.h"
 #include "catalog/namespace.h"
 #include "fmgr.h"
 #include "funcapi.h"
@@ -42,8 +41,6 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
 PG_FUNCTION_INFO_V1(neon_xlogflush);
-PG_FUNCTION_INFO_V1(trigger_panic);
-PG_FUNCTION_INFO_V1(trigger_segfault);

 /*
 * Linkage to functions in neon module.
@@ -447,68 +444,12 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)

 /*
 * Directly calls XLogFlush(lsn) to flush WAL buffers.
- *
- * If 'lsn' is not specified (is NULL), flush all generated WAL.
 */
 Datum
 neon_xlogflush(PG_FUNCTION_ARGS)
 {
-	XLogRecPtr	lsn;
-
-	if (RecoveryInProgress())
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("recovery is in progress"),
-				 errhint("cannot flush WAL during recovery.")));
-
-	if (!PG_ARGISNULL(0))
-		lsn = PG_GETARG_LSN(0);
-	else
-	{
-		lsn = GetXLogInsertRecPtr();
-
-		/*---
-		 * The LSN returned by GetXLogInsertRecPtr() is the position where the
-		 * next inserted record would begin. If the last record ended just at
-		 * the page boundary, the next record will begin after the page header
-		 * on the next page, but the next page's page header has not been
-		 * written yet. If we tried to flush it, XLogFlush() would throw an
-		 * error:
-		 *
-		 * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X
-		 *
-		 * To avoid that, if the insert position points to just after the page
-		 * header, back off to page boundary.
-		 */
-		if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
-			XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ)
-			lsn -= SizeOfXLogShortPHD;
-		else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
-				 XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ)
-			lsn -= SizeOfXLogLongPHD;
-	}
+	XLogRecPtr	lsn = PG_GETARG_LSN(0);

 	XLogFlush(lsn);
 	PG_RETURN_VOID();
 }
-
-/*
- * Function to trigger panic.
- */
-Datum
-trigger_panic(PG_FUNCTION_ARGS)
-{
-    elog(PANIC, "neon_test_utils: panic");
-    PG_RETURN_VOID();
-}
-
-/*
- * Function to trigger a segfault.
- */
-Datum
-trigger_segfault(PG_FUNCTION_ARGS)
-{
-    int *ptr = NULL;
-    *ptr = 42;
-    PG_RETURN_VOID();
-}
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -734,13 +734,13 @@ typing-extensions = ">=4.1.0"

 [[package]]
 name = "certifi"
-version = "2024.7.4"
+version = "2023.7.22"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"},
-    {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"},
+    {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
+    {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
 ]

 [[package]]
@@ -3133,18 +3133,18 @@ multidict = ">=4.0"

 [[package]]
 name = "zipp"
-version = "3.19.1"
+version = "3.8.1"
 description = "Backport of pathlib-compatible object wrapper for zip files"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.7"
 files = [
-    {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"},
-    {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"},
+    {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"},
+    {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"},
 ]

 [package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
+docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"]
+testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]

 [[package]]
 name = "zstandard"
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Conrad Ludgate	4c78a5067f	compress cache key	2024-06-28 09:12:18 +01:00
Conrad Ludgate	108f08f982	proxy: cache a compressed version of the node info	2024-06-28 09:04:54 +01:00
				`@@ -1 +0,0 @@`
				`DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE;`