Mirror of https://github.com/neondatabase/neon.git (synced 2026-05-17 13:10:38 +00:00)

Compare commits: vlad/sanit...rc/proxy/2 (174 commits)
| SHA1 |
|---|
| e250b9e063 |
| 0c236fa465 |
| da84a250c6 |
| 975f8ac658 |
| 839a5724a4 |
| f2b8e390e7 |
| f7131834eb |
| 4a90423292 |
| f4f0869dc8 |
| 0950866fa8 |
| 7cf59ae5b4 |
| b197cc20fc |
| ba17025a57 |
| b5ab055526 |
| a40b402957 |
| d2ee760eb2 |
| 66337097de |
| e6dadcd2f3 |
| 83e07c1a5b |
| ee263e6a62 |
| 7eb37fea26 |
| 730db859c7 |
| 04448ac323 |
| 324e4e008f |
| b49b450dc4 |
| 8a8b83df27 |
| 4bdfb96078 |
| 8da3b547f8 |
| b329b1c610 |
| 4184685721 |
| 411a130675 |
| 0645ae318e |
| 86d6ef305a |
| 2e37aa3fe8 |
| 30bbfde50d |
| 82b9a44ab4 |
| 4a87bac036 |
| 38b4ed297e |
| cd29156927 |
| 814c8e8f68 |
| 0159ae9536 |
| d9a82468e2 |
| e26ef640c1 |
| c11b9cb43d |
| 69b6675da0 |
| 6bbd34a216 |
| 24f8133e89 |
| 9f4511c554 |
| e78341e1c2 |
| 98387d6fb1 |
| 1afab13ccb |
| e89ec55ea5 |
| fe13fccdc2 |
| 1a49f1c15c |
| 9bb16c8780 |
| 3f7aebb01c |
| abc330e095 |
| 6d3cb222ee |
| b1fe8259b4 |
| 4a5b55c834 |
| 73fa3c014b |
| c196cf6ac1 |
| 8b15864f59 |
| d9c1068cf4 |
| 811eb88b89 |
| df3dc6e4c1 |
| daea26a22f |
| 84b039e615 |
| a68edad913 |
| fcdf060816 |
| 1c57f6bac3 |
| b54dd9af15 |
| 118847cd41 |
| f2ec542954 |
| 2a3410d1c3 |
| 1121a1cbac |
| 154ba5e1b4 |
| 27fe7f8963 |
| 0a937b7f91 |
| b8d031cd0c |
| f0d29a0f3e |
| 13522fb722 |
| c9fd8d7693 |
| 7dd2e447d3 |
| 6849ae4810 |
| 5aae80640b |
| 6876f0d066 |
| e25ac31fc9 |
| 711716c725 |
| 88b13d4552 |
| adde0ecfe0 |
| 19accfee4e |
| e579bc0819 |
| c9e6dd45d3 |
| bf9fc77061 |
| a004d27fca |
| a46253766b |
| 5b69b32dc5 |
| e03c3c9893 |
| bbb2fa7cdd |
| 778787d8e9 |
| 90b51dcf16 |
| a85aa03d18 |
| cdaed4d79c |
| ea0b22a9b0 |
| 392a58bdce |
| e0891ec8c8 |
| 97f7188a07 |
| aae3876318 |
| dae55badf3 |
| 4273309962 |
| 4a0c2aebe0 |
| 891cb5a9a8 |
| f5832329ac |
| 6216df7765 |
| 5de896e7d8 |
| 25eefdeb1f |
| 28929d9cfa |
| 9b4b4bbf6f |
| 1a0f545c16 |
| 7dcdbaa25e |
| 0497b99f3a |
| 9882ac8e06 |
| 0789160ffa |
| 9c32604aa9 |
| b02aafdfda |
| e823b92947 |
| aea5cfe21e |
| 9ce193082a |
| 75c84c846a |
| 57535c039c |
| 30027d94a2 |
| bc704917a3 |
| b8bbaafc03 |
| e1a06b40b7 |
| babbe125da |
| ca2f7d06b2 |
| c22c6a6c9e |
| deec3bc578 |
| 063553a51b |
| 5700233a47 |
| 1d66ca79a9 |
| 23827c6b0d |
| 66b0bf41a1 |
| 89cf8df93b |
| 54a06de4b5 |
| 6f20a18e8e |
| d557002675 |
| 32b75e7c73 |
| d2753719e3 |
| 04b2ac3fed |
| c39d5b03e8 |
| 76fc3d4aa1 |
| dd3adc3693 |
| 5b871802fd |
| 24ce73ffaf |
| 3118c24521 |
| 5af9660b9e |
| d7e349d33c |
| 47e5bf3bbb |
| 5d2f9ffa89 |
| fdadd6a152 |
| 9b623d3a2c |
| 9b98823d61 |
| 76864e6a2a |
| 6c5d3b5263 |
| cd9a550d97 |
| 07f21dd6b6 |
| 64a4461191 |
| 961fc0ba8f |
| 9b2f9419d9 |
| 947f6da75e |
| 7026dde9eb |
| d502313841 |
@@ -183,7 +183,7 @@ runs:
       uses: actions/cache@v4
       with:
         path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}

     - name: Store Allure test stat in the DB (new)
       if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
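This cache-key change recurs throughout the PR: appending `runner.arch` keeps x64 and arm64 runners from sharing a poetry virtualenv cache. A small illustration of how the key renders (the values below are stand-ins for `runner.os`, `runner.arch` and `hashFiles('poetry.lock')`, not from this PR):

```bash
# Stand-in values; the lock-file hash is made up.
OS=Linux ARCH=ARM64 LOCK_HASH=8c21f0
echo "before: v2-${OS}-python-deps-${LOCK_HASH}"
echo "after:  v2-${OS}-${ARCH}-python-deps-${LOCK_HASH}"
```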
.github/actions/download/action.yml (vendored; 2 lines changed)

@@ -26,7 +26,7 @@ runs:
       TARGET: ${{ inputs.path }}
       ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
       SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
-      PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
+      PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }}
     run: |
       BUCKET=neon-github-public-dev
       FILENAME=$(basename $ARCHIVE)
@@ -9,8 +9,8 @@ inputs:
     description: 'Region ID, if not set the project will be created in the default region'
     default: aws-us-east-2
   postgres_version:
-    description: 'Postgres version; default is 15'
-    default: '15'
+    description: 'Postgres version; default is 16'
+    default: '16'
   api_host:
     description: 'Neon API host'
     default: console-stage.neon.build
.github/actions/run-python-test-set/action.yml (vendored; 18 lines changed)

@@ -56,14 +56,14 @@ runs:
     if: inputs.build_type != 'remote'
     uses: ./.github/actions/download
     with:
-      name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
+      name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
       path: /tmp/neon

   - name: Download Neon binaries for the previous release
     if: inputs.build_type != 'remote'
     uses: ./.github/actions/download
     with:
-      name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
+      name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
       path: /tmp/neon-previous
       prefix: latest

@@ -89,7 +89,7 @@ runs:
     uses: actions/cache@v4
     with:
       path: ~/.cache/pypoetry/virtualenvs
-      key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+      key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}

   - name: Install Python deps
     shell: bash -euxo pipefail {0}

@@ -114,6 +114,8 @@ runs:
       export PLATFORM=${PLATFORM:-github-actions-selfhosted}
       export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
+      export DEFAULT_PG_VERSION=${PG_VERSION#v}
+      export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
       export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

       if [ "${BUILD_TYPE}" = "remote" ]; then
         export REMOTE_ENV=1

@@ -178,7 +180,15 @@ runs:
       # Wake up the cluster if we use remote neon instance
       if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
-        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
+        QUERIES=("SELECT version()")
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
+        fi
+
+        for q in "${QUERIES[@]}"; do
+          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
+        done
       fi

       # Run the tests.
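The wake-up loop added above can also be exercised outside CI. A minimal standalone sketch, assuming `psql` is on PATH and `BENCHMARK_CONNSTR`/`PLATFORM` are set in the environment as the action expects:

```bash
#!/usr/bin/env bash
set -euo pipefail
# Same query list the action builds: a plain version check everywhere,
# plus Neon-specific identifiers when the platform is a neon deployment.
QUERIES=("SELECT version()")
if [[ "${PLATFORM:-}" = "neon"* ]]; then
  QUERIES+=("SHOW neon.tenant_id" "SHOW neon.timeline_id")
fi
for q in "${QUERIES[@]}"; do
  psql "${BENCHMARK_CONNSTR}" -c "${q}"
done
```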
.github/actions/upload/action.yml (vendored; 4 lines changed)

@@ -8,7 +8,7 @@ inputs:
     description: "A directory or file to upload"
     required: true
   prefix:
-    description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
+    description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
     required: false

 runs:

@@ -45,7 +45,7 @@ runs:
     env:
       SOURCE: ${{ inputs.path }}
       ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
-      PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
+      PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id , github.run_attempt) }}
     run: |
       BUCKET=neon-github-public-dev
       FILENAME=$(basename $ARCHIVE)
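Together, the upload and download changes prepend the commit SHA to the artifact prefix. A sketch of how the final S3 key changes, using the bucket name the actions already use (the artifact name, run id and SHA below are hypothetical):

```bash
# Illustration of the key layout before and after this PR.
BUCKET=neon-github-public-dev
FILENAME=neon-Linux-X64-release-artifact.tar.zst   # hypothetical artifact name
RUN_ID=123456789
RUN_ATTEMPT=1
COMMIT_SHA=0123456789abcdef0123456789abcdef01234567 # hypothetical head SHA
echo "before: s3://${BUCKET}/artifacts/${RUN_ID}/${RUN_ATTEMPT}/${FILENAME}"
echo "after:  s3://${BUCKET}/artifacts/${COMMIT_SHA}/${RUN_ID}/${RUN_ATTEMPT}/${FILENAME}"
```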
.github/workflows/benchmarking.yml (vendored; 250 lines changed)

@@ -56,15 +56,26 @@ concurrency:
 jobs:
   bench:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    strategy:
+      matrix:
+        include:
+          - DEFAULT_PG_VERSION: 16
+            PLATFORM: "neon-staging"
+            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            provisioner: 'k8s-pod'
+          - DEFAULT_PG_VERSION: 16
+            PLATFORM: "azure-staging"
+            region_id: 'azure-eastus2'
+            provisioner: 'k8s-neonvm'
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "300"
       TEST_PG_BENCH_SCALES_MATRIX: "10,100"
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-staging"
+      PLATFORM: ${{ matrix.PLATFORM }}

     runs-on: [ self-hosted, us-east-2, x64 ]
     container:

@@ -77,7 +88,7 @@ jobs:
       - name: Download Neon artifact
         uses: ./.github/actions/download
         with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
           path: /tmp/neon/
           prefix: latest

@@ -85,9 +96,10 @@ jobs:
         id: create-neon-project
         uses: ./.github/actions/neon-project-create
         with:
-          region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+          region_id: ${{ matrix.region_id }}
           postgres_version: ${{ env.DEFAULT_PG_VERSION }}
           api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+          provisioner: ${{ matrix.provisioner }}

       - name: Run benchmark
         uses: ./.github/actions/run-python-test-set

@@ -96,10 +108,18 @@ jobs:
           test_selection: performance
           run_in_parallel: false
           save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
           # Set --sparse-ordering option of pytest-order plugin
           # to ensure tests are running in order of appears in the file.
           # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-          extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+          extra_params:
+            -m remote_cluster
+            --sparse-ordering
+            --timeout 14400
+            --ignore test_runner/performance/test_perf_olap.py
+            --ignore test_runner/performance/test_perf_pgvector_queries.py
+            --ignore test_runner/performance/test_logical_replication.py
+            --ignore test_runner/performance/test_physical_replication.py
         env:
           BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -125,6 +145,71 @@ jobs:
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

+  replication-tests:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 14
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+
+      - name: Run benchmark
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_logical_replication.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 5400
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+      - name: Run benchmark
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ env.BUILD_TYPE }}
+          test_selection: performance/test_physical_replication.py
+          run_in_parallel: false
+          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+          extra_params: -m remote_cluster --timeout 5400
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        uses: ./.github/actions/allure-report-generate
+
+      - name: Post to a Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: "C033QLM5P7D" # dev-staging-stream
+          slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
   generate-matrices:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
@@ -132,11 +217,14 @@ jobs:
     # Available platforms:
     # - neon-captest-new: Freshly created project (1 CU)
     # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
+    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
+    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
     # - neon-captest-reuse: Reusing existing project
     # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
     # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
     env:
       RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
+      DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
     runs-on: ubuntu-22.04
     outputs:
       pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}

@@ -147,23 +235,33 @@ jobs:
       - name: Generate matrix for pgbench benchmark
         id: pgbench-compare-matrix
         run: |
+          region_id_default=${{ env.DEFAULT_REGION_ID }}
           matrix='{
+            "pg_version" : [
+              16
+            ],
+            "region_id" : [
+              "'"$region_id_default"'"
+            ],
             "platform": [
               "neon-captest-new",
               "neon-captest-reuse",
               "neonvm-captest-new"
             ],
             "db_size": [ "10gb" ],
-            "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
-                        { "platform": "neon-captest-new", "db_size": "50gb" },
-                        { "platform": "neonvm-captest-freetier", "db_size": "3gb" },
-                        { "platform": "neonvm-captest-new", "db_size": "50gb" },
-                        { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+            "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
+                        { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
+                        { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
+                        { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
+                        { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
+                        { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
           }'

           if [ "$(date +%A)" = "Saturday" ]; then
-            matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                       { "platform": "rds-aurora", "db_size": "50gb"}]')
+            matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
+                                                       { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]')
           fi

           echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
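The matrix is assembled as a shell string and extended with jq on Saturdays. A minimal local sketch of the same transform, assuming a hypothetical `matrix.json` copy of the document above:

```bash
# Reproduce the Saturday extension locally; "matrix.json" is a hypothetical
# local copy of the generated matrix document.
matrix=$(cat matrix.json)
if [ "$(date +%A)" = "Saturday" ]; then
  matrix=$(echo "$matrix" | jq '.include += [
    { "pg_version": 14, "platform": "rds-postgres", "db_size": "10gb" },
    { "pg_version": 14, "platform": "rds-aurora",   "db_size": "50gb" }
  ]')
fi
echo "$matrix" | jq --compact-output '.'   # same compact shape the workflow emits
```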
@@ -215,7 +313,7 @@ jobs:
       TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
       TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
+      DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}

@@ -235,24 +333,19 @@ jobs:
       - name: Download Neon artifact
         uses: ./.github/actions/download
         with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
           path: /tmp/neon/
           prefix: latest

-      - name: Add Postgres binaries to PATH
-        run: |
-          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
       - name: Create Neon Project
-        if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
+        if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
         id: create-neon-project
         uses: ./.github/actions/neon-project-create
         with:
-          region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+          region_id: ${{ matrix.region_id }}
           postgres_version: ${{ env.DEFAULT_PG_VERSION }}
           api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
+          compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
+          provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

       - name: Set up Connection String

@@ -265,7 +358,7 @@ jobs:
           neonvm-captest-sharding-reuse)
             CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
             ;;
-          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
             CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
             ;;
           rds-aurora)

@@ -282,16 +375,6 @@ jobs:
           echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-          QUERIES=("SELECT version()")
-          if [[ "${PLATFORM}" = "neon"* ]]; then
-            QUERIES+=("SHOW neon.tenant_id")
-            QUERIES+=("SHOW neon.timeline_id")
-          fi
-
-          for q in "${QUERIES[@]}"; do
-            psql ${CONNSTR} -c "${q}"
-          done
-
       - name: Benchmark init
         uses: ./.github/actions/run-python-test-set
         with:

@@ -300,6 +383,7 @@ jobs:
           run_in_parallel: false
           save_perf_report: ${{ env.SAVE_PERF_REPORT }}
           extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
         env:
           BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

@@ -313,6 +397,7 @@ jobs:
           run_in_parallel: false
           save_perf_report: ${{ env.SAVE_PERF_REPORT }}
           extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
         env:
           BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

@@ -326,6 +411,7 @@ jobs:
           run_in_parallel: false
           save_perf_report: ${{ env.SAVE_PERF_REPORT }}
           extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
         env:
           BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -352,6 +438,12 @@ jobs:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

   pgbench-pgvector:
+    strategy:
+      matrix:
+        include:
+          - PLATFORM: "neon-captest-pgvector"
+          - PLATFORM: "azure-captest-pgvector"
+
     env:
       TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
       TEST_PG_BENCH_SCALES_MATRIX: "1"

@@ -360,7 +452,7 @@ jobs:
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-captest-pgvector"
+      PLATFORM: ${{ matrix.PLATFORM }}

     runs-on: [ self-hosted, us-east-2, x64 ]
     container:

@@ -373,29 +465,27 @@ jobs:
       - name: Download Neon artifact
         uses: ./.github/actions/download
         with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
           path: /tmp/neon/
           prefix: latest

       - name: Add Postgres binaries to PATH
         run: |
           ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
           echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

       - name: Set up Connection String
         id: set-up-connstr
         run: |
-          CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
-
-          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+          case "${PLATFORM}" in
+          neon-captest-pgvector)
+            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
+            ;;
+          azure-captest-pgvector)
+            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
+            ;;
+          *)
+            echo >&2 "Unknown PLATFORM=${PLATFORM}"
+            exit 1
+            ;;
+          esac

           QUERIES=("SELECT version()")
           QUERIES+=("SHOW neon.tenant_id")
           QUERIES+=("SHOW neon.timeline_id")

           for q in "${QUERIES[@]}"; do
             psql ${CONNSTR} -c "${q}"
           done
+          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

       - name: Benchmark pgvector hnsw indexing
         uses: ./.github/actions/run-python-test-set

@@ -405,6 +495,7 @@ jobs:
           run_in_parallel: false
           save_perf_report: ${{ env.SAVE_PERF_REPORT }}
           extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

@@ -417,12 +508,13 @@ jobs:
           test_selection: performance/test_perf_pgvector_queries.py
           run_in_parallel: false
           save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-          extra_params: -m remote_cluster --timeout 21600
+          extra_params: -m remote_cluster --timeout 21600
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
         env:
           BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

       - name: Create Allure report
         if: ${{ !cancelled() }}
         uses: ./.github/actions/allure-report-generate

@@ -432,7 +524,7 @@ jobs:
         uses: slackapi/slack-github-action@v1
         with:
           channel-id: "C033QLM5P7D" # dev-staging-stream
-          slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -473,15 +565,10 @@ jobs:
       - name: Download Neon artifact
         uses: ./.github/actions/download
         with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
           path: /tmp/neon/
           prefix: latest

-      - name: Add Postgres binaries to PATH
-        run: |
-          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
       - name: Set up Connection String
         id: set-up-connstr
         run: |

@@ -503,16 +590,6 @@ jobs:
           echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-          QUERIES=("SELECT version()")
-          if [[ "${PLATFORM}" = "neon"* ]]; then
-            QUERIES+=("SHOW neon.tenant_id")
-            QUERIES+=("SHOW neon.timeline_id")
-          fi
-
-          for q in "${QUERIES[@]}"; do
-            psql ${CONNSTR} -c "${q}"
-          done
-
       - name: ClickBench benchmark
         uses: ./.github/actions/run-python-test-set
         with:

@@ -576,15 +653,10 @@ jobs:
       - name: Download Neon artifact
         uses: ./.github/actions/download
         with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
           path: /tmp/neon/
           prefix: latest

-      - name: Add Postgres binaries to PATH
-        run: |
-          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
       - name: Get Connstring Secret Name
         run: |
           case "${PLATFORM}" in

@@ -613,16 +685,6 @@ jobs:
           echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-          QUERIES=("SELECT version()")
-          if [[ "${PLATFORM}" = "neon"* ]]; then
-            QUERIES+=("SHOW neon.tenant_id")
-            QUERIES+=("SHOW neon.timeline_id")
-          fi
-
-          for q in "${QUERIES[@]}"; do
-            psql ${CONNSTR} -c "${q}"
-          done
-
       - name: Run TPC-H benchmark
         uses: ./.github/actions/run-python-test-set
         with:

@@ -677,15 +739,10 @@ jobs:
       - name: Download Neon artifact
         uses: ./.github/actions/download
         with:
-          name: neon-${{ runner.os }}-release-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
           path: /tmp/neon/
           prefix: latest

-      - name: Add Postgres binaries to PATH
-        run: |
-          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
-          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
-
       - name: Set up Connection String
         id: set-up-connstr
         run: |

@@ -707,16 +764,6 @@ jobs:
           echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-          QUERIES=("SELECT version()")
-          if [[ "${PLATFORM}" = "neon"* ]]; then
-            QUERIES+=("SHOW neon.tenant_id")
-            QUERIES+=("SHOW neon.timeline_id")
-          fi
-
-          for q in "${QUERIES[@]}"; do
-            psql ${CONNSTR} -c "${q}"
-          done
-
       - name: Run user examples
         uses: ./.github/actions/run-python-test-set
         with:

@@ -725,6 +772,7 @@ jobs:
           run_in_parallel: false
           save_perf_report: ${{ env.SAVE_PERF_REPORT }}
           extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
+          pg_version: ${{ env.DEFAULT_PG_VERSION }}
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
.github/workflows/build-build-tools-image.yml (vendored; 11 lines changed)

@@ -63,14 +63,16 @@ jobs:
           mkdir -p /tmp/.docker-custom
           echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV

-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3
+        with:
+          cache-binary: false

-      - uses: docker/login-action@v2
+      - uses: docker/login-action@v3
         with:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
           password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/build-push-action@v4
+      - uses: docker/build-push-action@v6
         with:
           context: .
           provenance: false

@@ -78,10 +80,11 @@ jobs:
           pull: true
           file: Dockerfile.build-tools
           cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
           tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}

       - name: Remove custom docker config directory
         if: always()
         run: |
           rm -rf /tmp/.docker-custom
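The conditional `cache-to` means only builds on `main` push the registry cache; branch builds still read it. A rough CLI sketch of what that conditional reduces to (an assumption about the equivalent `docker buildx` invocation, not taken from the PR):

```bash
# Branch builds get an empty CACHE_TO and only consume the cache.
ARCH=amd64
CACHE_TO=""
if [ "${GITHUB_REF_NAME:-}" = "main" ]; then
  CACHE_TO="--cache-to type=registry,ref=neondatabase/build-tools:cache-${ARCH},mode=max"
fi
docker buildx build \
  --cache-from "type=registry,ref=neondatabase/build-tools:cache-${ARCH}" \
  ${CACHE_TO} \
  -f Dockerfile.build-tools .
```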
.github/workflows/build_and_test.yml (vendored; 86 lines changed)

@@ -30,7 +30,7 @@ jobs:
     if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
     uses: ./.github/workflows/check-permissions.yml
     with:
-      github-event-name: ${{ github.event_name}}
+      github-event-name: ${{ github.event_name }}

   cancel-previous-e2e-tests:
     needs: [ check-permissions ]

@@ -109,7 +109,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ~/.cache/pypoetry/virtualenvs
-          key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}

       - name: Install Python deps
         run: ./scripts/pysync

@@ -149,7 +149,7 @@ jobs:
       #     !~/.cargo/registry/src
       #     ~/.cargo/git/
       #     target/
-      #   key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+      #   key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}

       # Some of our rust modules use FFI and need those to be checked
       - name: Get postgres headers

@@ -291,29 +291,29 @@ jobs:
      #     target/
      #   # Fall back to older versions of the key, if no cache for current Cargo.lock was found
      #   key: |
-     #     v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
-     #     v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
+     #     v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+     #     v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-

       - name: Cache postgres v14 build
         id: cache_pg_14
         uses: actions/cache@v4
         with:
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

       - name: Cache postgres v15 build
         id: cache_pg_15
         uses: actions/cache@v4
         with:
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

       - name: Cache postgres v16 build
         id: cache_pg_16
         uses: actions/cache@v4
         with:
           path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

       - name: Build postgres v14
         if: steps.cache_pg_14.outputs.cache-hit != 'true'

@@ -335,6 +335,8 @@ jobs:
       - name: Run cargo build
         run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
           ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

       # Do install *before* running rust tests because they might recompile the

@@ -383,6 +385,11 @@ jobs:
         env:
           NEXTEST_RETRIES: 3
         run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
           #nextest does not yet support running doctests
           cargo test --doc $CARGO_FLAGS $CARGO_FEATURES

@@ -411,7 +418,7 @@ jobs:
       - name: Upload Neon artifact
         uses: ./.github/actions/upload
         with:
-          name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
           path: /tmp/neon

       # XXX: keep this after the binaries.list is formed, so the coverage can properly work later

@@ -490,7 +497,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ~/.cache/pypoetry/virtualenvs
-          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}

       - name: Install Python deps
         run: ./scripts/pysync

@@ -639,7 +646,7 @@ jobs:
       - name: Get Neon artifact
         uses: ./.github/actions/download
         with:
-          name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
           path: /tmp/neon

       - name: Get coverage artifact
@@ -744,14 +751,16 @@ jobs:
         run: |
           mkdir -p .docker-custom
           echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3
+        with:
+          cache-binary: false

       - uses: docker/login-action@v3
         with:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
           password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/build-push-action@v5
+      - uses: docker/build-push-action@v6
         with:
           context: .
           build-args: |

@@ -763,7 +772,7 @@ jobs:
           pull: true
           file: Dockerfile
           cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
           tags: |
             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

@@ -822,11 +831,12 @@ jobs:
         run: |
           mkdir -p .docker-custom
           echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3
         with:
+          cache-binary: false
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
-          config-inline: |
+          buildkitd-config-inline: |
            [worker.oci]
              max-parallelism = 1

@@ -842,7 +852,7 @@ jobs:
           password: ${{ secrets.AWS_SECRET_KEY_DEV }}

       - name: Build compute-node image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           context: .
           build-args: |

@@ -855,13 +865,13 @@ jobs:
           pull: true
           file: Dockerfile.compute-node
           cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
           tags: |
             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

       - name: Build neon extensions test image
         if: matrix.version == 'v16'
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           context: .
           build-args: |

@@ -875,14 +885,14 @@ jobs:
           file: Dockerfile.compute-node
           target: neon-pg-ext-test
           cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
           tags: |
             neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}

       - name: Build compute-tools image
         # compute-tools are Postgres independent, so build it only once
         if: matrix.version == 'v16'
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           target: compute-tools-image
           context: .

@@ -1245,6 +1255,7 @@ jobs:
         run: |
           if [[ "$GITHUB_REF_NAME" == "main" ]]; then
             gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
+            gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
             gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
               -f deployPgSniRouter=false \

@@ -1325,6 +1336,7 @@ jobs:
         env:
           BUCKET: neon-github-public-dev
           PREFIX: artifacts/latest
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
         run: |
           # Update compatibility snapshot for the release
           for pg_version in v14 v15 v16; do

@@ -1338,8 +1350,8 @@ jobs:
           # Update Neon artifact for the release (reuse already uploaded artifact)
           for build_type in debug release; do
-            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
-            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
+            OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
+            FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst

             S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
             if [ -z "${S3_KEY}" ]; then

@@ -1357,3 +1369,31 @@ jobs:
     with:
       from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
     secrets: inherit

+  # This job simplifies setting branch protection rules (in GitHub UI)
+  # by allowing to set only this job instead of listing many others.
+  # It also makes it easier to rename or parametrise jobs (using matrix)
+  # which requires changes in branch protection rules
+  #
+  # Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that.
+  #
+  # https://github.com/neondatabase/neon/settings/branch_protection_rules
+  conclusion:
+    if: always()
+    # Format `needs` differently to make the list more readable.
+    # Usually we do `needs: [...]`
+    needs:
+      - check-codestyle-python
+      - check-codestyle-rust
+      - regress-tests
+      - test-images
+    runs-on: ubuntu-22.04
+    steps:
+      # The list of possible results:
+      # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
+      - name: Fail the job if any of the dependencies do not succeed
+        run: exit 1
+        if: |
+          contains(needs.*.result, 'failure')
+          || contains(needs.*.result, 'cancelled')
+          || contains(needs.*.result, 'skipped')
.github/workflows/neon_extra_builds.yml (vendored; 9 lines changed)

@@ -232,12 +232,19 @@ jobs:
       - name: Run cargo build
         run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
           mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)

       - name: Run cargo test
         env:
           NEXTEST_RETRIES: 3
         run: |
+          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
+          export PQ_LIB_DIR
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
+          export LD_LIBRARY_PATH
+
           cargo nextest run $CARGO_FEATURES -j$(nproc)

       # Run separate tests for real S3

@@ -378,7 +385,7 @@ jobs:
         run: make walproposer-lib -j$(nproc)

       - name: Produce the build stats
-        run: cargo build --all --release --timings -j$(nproc)
+        run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)

       - name: Upload the build stats
         id: upload-stats
.github/workflows/periodic_pagebench.yml (vendored; new file, 155 lines)

name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '0 18 * * *' # Runs at 6 PM UTC every day
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
        type: string
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''

defaults:
  run:
    shell: bash -euo pipefail {0}

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

jobs:
  trigger_bench_on_ec2_machine_in_eu_central_1:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    timeout-minutes: 360 # Set the timeout to 6 hours
    env:
      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
      AWS_DEFAULT_REGION : "eu-central-1"
      AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
    steps:
      # we don't need the neon source code because we run everything remotely
      # however we still need the local github actions to run the allure step below
      - uses: actions/checkout@v4

      - name: Show my own (github runner) external IP address - usefull for IP allowlisting
        run: curl https://ifconfig.me

      - name: Start EC2 instance and wait for the instance to boot up
        run: |
          aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
          aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
          sleep 60 # sleep some time to allow cloudinit and our API server to start up

      - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
        run: |
          public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
          echo "Public IP of the EC2 instance: $public_ip"
          echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV

      - name: Determine commit hash
        env:
          INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
        run: |
          if [ -z "$INPUT_COMMIT_HASH" ]; then
            echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
          else
            echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
          fi

      - name: Start Bench with run_id
        run: |
          curl -k -X 'POST' \
            "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
            -H 'accept: application/json' \
            -H 'Content-Type: application/json' \
            -H "Authorization: Bearer $API_KEY" \
            -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"

      - name: Poll Test Status
        id: poll_step
        run: |
          status=""
          while [[ "$status" != "failure" && "$status" != "success" ]]; do
            response=$(curl -k -X 'GET' \
              "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
              -H 'accept: application/json' \
              -H "Authorization: Bearer $API_KEY")
            echo "Response: $response"
            set +x
            status=$(echo $response | jq -r '.status')
            echo "Test status: $status"
            if [[ "$status" == "failure" ]]; then
              echo "Test failed"
              exit 1 # Fail the job step if status is failure
            elif [[ "$status" == "success" || "$status" == "null" ]]; then
              break
            elif [[ "$status" == "too_many_runs" ]]; then
              echo "Too many runs already running"
              echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
              exit 1
            fi

            sleep 60 # Poll every 60 seconds
          done

      - name: Retrieve Test Logs
        if: always() && steps.poll_step.outputs.too_many_runs != 'true'
        run: |
          curl -k -X 'GET' \
            "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
            -H 'accept: application/gzip' \
            -H "Authorization: Bearer $API_KEY" \
            --output "test_log_${GITHUB_RUN_ID}.gz"

      - name: Unzip Test Log and Print it into this job's log
        if: always() && steps.poll_step.outputs.too_many_runs != 'true'
        run: |
          gzip -d "test_log_${GITHUB_RUN_ID}.gz"
          cat "test_log_${GITHUB_RUN_ID}"

      - name: Create Allure report
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
          slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

      - name: Cleanup Test Resources
        if: always()
        run: |
          curl -k -X 'POST' \
            "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
            -H 'accept: application/json' \
            -H "Authorization: Bearer $API_KEY" \
            -d ''

      - name: Stop EC2 instance and wait for the instance to be stopped
        if: always() && steps.poll_step.outputs.too_many_runs != 'true'
        run: |
          aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
          aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
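Besides the nightly cron, the run can be started by hand through the `workflow_dispatch` trigger defined above. A sketch using the GitHub CLI (requires access to the repo; the SHA is a placeholder):

```bash
# Kick off a one-off pagebench run against a specific commit.
gh workflow run periodic_pagebench.yml \
  --repo neondatabase/neon \
  -f commit_hash=<full-40-char-sha>
gh run watch --repo neondatabase/neon   # optional: follow the run interactively
```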
.github/workflows/pg-clients.yml (vendored; new file, 115 lines)

name: Test Postgres client libraries

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '23 02 * * *' # run once a day, timezone is utc
  pull_request:
    paths:
      - '.github/workflows/pg-clients.yml'
      - 'test_runner/pg_clients/**'
      - 'poetry.lock'
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

defaults:
  run:
    shell: bash -euxo pipefail {0}

env:
  DEFAULT_PG_VERSION: 16
  PLATFORM: neon-captest-new
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
  AWS_DEFAULT_REGION: eu-central-1

jobs:
  check-permissions:
    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
    uses: ./.github/workflows/check-permissions.yml
    with:
      github-event-name: ${{ github.event_name }}

  check-build-tools-image:
    needs: [ check-permissions ]
    uses: ./.github/workflows/check-build-tools-image.yml

  build-build-tools-image:
    needs: [ check-build-tools-image ]
    uses: ./.github/workflows/build-build-tools-image.yml
    with:
      image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
    secrets: inherit

  test-postgres-client-libs:
    needs: [ build-build-tools-image ]
    runs-on: ubuntu-22.04

    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init --user root

    steps:
      - uses: actions/checkout@v4

      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
          path: /tmp/neon/
          prefix: latest

      - name: Create Neon Project
        id: create-neon-project
        uses: ./.github/actions/neon-project-create
        with:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}

      - name: Run tests
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: remote
          test_selection: pg_clients
          run_in_parallel: false
          extra_params: -m remote_cluster
          pg_version: ${{ env.DEFAULT_PG_VERSION }}
        env:
          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}

      - name: Delete Neon Project
        if: always()
        uses: ./.github/actions/neon-project-delete
        with:
          project_id: ${{ steps.create-neon-project.outputs.project_id }}
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}

      - name: Create Allure report
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
        with:
          store-test-results-into-db: true
        env:
          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - name: Post to a Slack channel
        if: github.event.schedule && failure()
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
          slack-message: |
            Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
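For reference, the same suite can be run against an existing project without CI; a hedged sketch based on the pytest invocation visible in the old workflow this file replaces (the DSN below is hypothetical):

```bash
# Run the pg_clients suite locally against a remote cluster.
export REMOTE_ENV=1                                        # mark the cluster as remote for the test framework
export BENCHMARK_CONNSTR='postgres://user:pass@host/db'    # hypothetical connection string
./scripts/pytest -m remote_cluster -rA test_runner/pg_clients
```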
.github/workflows/pg_clients.yml (vendored; deleted, 98 lines removed)

name: Test Postgres client libraries

on:
  schedule:
    # * is a special character in YAML so you have to quote this string
    # ┌───────────── minute (0 - 59)
    # │ ┌───────────── hour (0 - 23)
    # │ │ ┌───────────── day of the month (1 - 31)
    # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
    - cron: '23 02 * * *' # run once a day, timezone is utc

  workflow_dispatch:

concurrency:
  # Allow only one workflow per any non-`main` branch.
  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
  cancel-in-progress: true

jobs:
  test-postgres-client-libs:
    # TODO: switch to gen2 runner, requires docker
    runs-on: ubuntu-22.04

    env:
      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - uses: actions/setup-python@v4
        with:
          python-version: 3.9

      - name: Install Poetry
        uses: snok/install-poetry@v1

      - name: Cache poetry deps
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        shell: bash -euxo pipefail {0}
        run: ./scripts/pysync

      - name: Create Neon Project
        id: create-neon-project
        uses: ./.github/actions/neon-project-create
        with:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}

      - name: Run pytest
        env:
          REMOTE_ENV: 1
          BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
          POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
        shell: bash -euxo pipefail {0}
        run: |
          # Test framework expects we have psql binary;
          # but since we don't really need it in this test, let's mock it
          mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
          ./scripts/pytest \
            --junitxml=$TEST_OUTPUT/junit.xml \
            --tb=short \
            --verbose \
            -m "remote_cluster" \
            -rA "test_runner/pg_clients"

      - name: Delete Neon Project
        if: ${{ always() }}
        uses: ./.github/actions/neon-project-delete
        with:
          project_id: ${{ steps.create-neon-project.outputs.project_id }}
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}

      # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
      # It will be fixed after switching to gen2 runner
      - name: Upload python test logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          retention-days: 7
          name: python-test-pg_clients-${{ runner.os }}-stage-logs
          path: ${{ env.TEST_OUTPUT }}

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
          slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -1,4 +1,5 @@
# * `-A unknown_lints` – do not warn about unknown lint suppressions
# that people with newer toolchains might use
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
# * `-D clippy::todo` - don't let `todo!()` slip into `main`
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo"
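A quick aside on the new flag: a minimal sketch (hypothetical function, not from this repo) of the kind of stub that `-D clippy::todo` now turns into a hard `cargo clippy` failure:

```rust
// With `-D clippy::todo`, clippy rejects this outright instead of
// letting the stub slip into `main`.
fn apply_retention_policy(_bytes: u64) -> u64 {
    todo!("decide how much history to keep")
}

fn main() {
    // Referenced only so the sketch compiles without dead-code warnings.
    let _ = apply_retention_policy as fn(u64) -> u64;
}
```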
219  Cargo.lock  generated
@@ -1236,6 +1236,7 @@ dependencies = [
"regex",
"remote_storage",
"reqwest 0.12.4",
"rlimit",
"rust-ini",
"serde",
"serde_json",
@@ -1246,7 +1247,7 @@ dependencies = [
"tokio-postgres",
"tokio-stream",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -1362,8 +1363,8 @@ dependencies = [
"tokio",
"tokio-postgres",
"tokio-util",
"toml",
"toml_edit",
"toml 0.7.4",
"toml_edit 0.19.10",
"tracing",
"url",
"utils",
@@ -1397,9 +1398,9 @@ dependencies = [

[[package]]
name = "crc32c"
version = "0.6.5"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
dependencies = [
"rustc_version",
]
@@ -1651,6 +1652,16 @@ dependencies = [
"rusticata-macros",
]

[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
"serde",
]

[[package]]
name = "desim"
version = "0.1.0"
@@ -1669,9 +1680,9 @@ dependencies = [

[[package]]
name = "diesel"
version = "2.1.4"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
dependencies = [
"bitflags 2.4.1",
"byteorder",
@@ -1684,11 +1695,12 @@ dependencies = [

[[package]]
name = "diesel_derives"
version = "2.1.2"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
checksum = "59de76a222c2b8059f789cbe07afbfd8deb8c31dd0bc2a21f85e256c1def8259"
dependencies = [
"diesel_table_macro_syntax",
"dsl_auto_type",
"proc-macro2",
"quote",
"syn 2.0.52",
@@ -1696,9 +1708,9 @@ dependencies = [

[[package]]
name = "diesel_migrations"
version = "2.1.0"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6"
dependencies = [
"diesel",
"migrations_internals",
@@ -1707,9 +1719,9 @@ dependencies = [

[[package]]
name = "diesel_table_macro_syntax"
version = "0.1.0"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25"
dependencies = [
"syn 2.0.52",
]
@@ -1745,6 +1757,20 @@ dependencies = [
"const-random",
]

[[package]]
name = "dsl_auto_type"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
dependencies = [
"darling",
"either",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.52",
]

[[package]]
name = "dyn-clone"
version = "1.0.14"
@@ -2002,16 +2028,6 @@ dependencies = [
"tokio-util",
]

[[package]]
name = "fs2"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
dependencies = [
"libc",
"winapi",
]

[[package]]
name = "fsevent-sys"
version = "4.1.0"
@@ -2993,9 +3009,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

[[package]]
name = "measured"
version = "0.0.21"
version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
dependencies = [
"bytes",
"crossbeam-utils",
@@ -3011,9 +3027,9 @@ dependencies = [

[[package]]
name = "measured-derive"
version = "0.0.21"
version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
dependencies = [
"heck 0.5.0",
"proc-macro2",
@@ -3023,9 +3039,9 @@ dependencies = [

[[package]]
name = "measured-process"
version = "0.0.21"
version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
dependencies = [
"libc",
"measured",
@@ -3084,19 +3100,19 @@ dependencies = [

[[package]]
name = "migrations_internals"
version = "2.1.0"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
dependencies = [
"serde",
"toml",
"toml 0.8.14",
]

[[package]]
name = "migrations_macros"
version = "2.1.0"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd"
dependencies = [
"migrations_internals",
"proc-macro2",
@@ -3260,6 +3276,12 @@ dependencies = [
"num-traits",
]

[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"

[[package]]
name = "num-integer"
version = "0.1.45"
@@ -3576,7 +3598,7 @@ dependencies = [
"thiserror",
"tokio",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"utils",
"workspace_hack",
]
@@ -3652,6 +3674,7 @@ dependencies = [
"sysinfo",
"tenant_size_model",
"thiserror",
"tikv-jemallocator",
"tokio",
"tokio-epoll-uring",
"tokio-io-timeout",
@@ -3659,7 +3682,7 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"tracing",
"twox-hash",
"url",
@@ -4062,6 +4085,7 @@ dependencies = [
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-util",
"tracing",
"workspace_hack",
]
@@ -4102,6 +4126,12 @@ dependencies = [
"workspace_hack",
]

[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"

[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -4374,6 +4404,7 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"typed-json",
"url",
"urlencoding",
"utils",
@@ -4665,7 +4696,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"tracing",
"utils",
"workspace_hack",
@@ -4862,6 +4893,15 @@ dependencies = [
"windows-sys 0.48.0",
]

[[package]]
name = "rlimit"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
dependencies = [
"libc",
]

[[package]]
name = "routerify"
version = "3.0.0"
@@ -5130,7 +5170,6 @@ dependencies = [
"crc32c",
"desim",
"fail",
"fs2",
"futures",
"git-version",
"hex",
@@ -5157,6 +5196,8 @@ dependencies = [
"sha2",
"signal-hook",
"storage_broker",
"strum",
"strum_macros",
"thiserror",
"tokio",
"tokio-io-timeout",
@@ -5164,7 +5205,7 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit",
"toml_edit 0.19.10",
"tracing",
"tracing-subscriber",
"url",
@@ -5381,9 +5422,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

[[package]]
name = "serde"
version = "1.0.183"
version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
dependencies = [
"serde_derive",
]
@@ -5400,9 +5441,9 @@ dependencies = [

[[package]]
name = "serde_derive"
version = "1.0.183"
version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
dependencies = [
"proc-macro2",
"quote",
@@ -5443,9 +5484,9 @@ dependencies = [

[[package]]
name = "serde_spanned"
version = "0.6.2"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d"
checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0"
dependencies = [
"serde",
]
@@ -6092,12 +6133,15 @@ dependencies = [

[[package]]
name = "time"
version = "0.3.21"
version = "0.3.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
dependencies = [
"deranged",
"itoa",
"js-sys",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
@@ -6105,16 +6149,17 @@ dependencies = [

[[package]]
name = "time-core"
version = "0.1.1"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"

[[package]]
name = "time-macros"
version = "0.2.9"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
dependencies = [
"num-conv",
"time-core",
]

@@ -6330,14 +6375,26 @@ dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
"toml_edit 0.19.10",
]

[[package]]
name = "toml"
version = "0.8.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335"
dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit 0.22.14",
]

[[package]]
name = "toml_datetime"
version = "0.6.2"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f"
checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf"
dependencies = [
"serde",
]
@@ -6352,7 +6409,20 @@ dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
"winnow 0.4.6",
]

[[package]]
name = "toml_edit"
version = "0.22.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
dependencies = [
"indexmap 2.0.1",
"serde",
"serde_spanned",
"toml_datetime",
"winnow 0.6.13",
]

[[package]]
@@ -6432,17 +6502,6 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"

[[package]]
name = "trace"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"pageserver_api",
"utils",
"workspace_hack",
]

[[package]]
name = "tracing"
version = "0.1.37"
@@ -6607,6 +6666,16 @@ dependencies = [
"static_assertions",
]

[[package]]
name = "typed-json"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
dependencies = [
"serde",
"serde_json",
]

[[package]]
name = "typenum"
version = "1.16.0"
@@ -6771,6 +6840,7 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit 0.19.10",
"tracing",
"tracing-error",
"tracing-subscriber",
@@ -7335,6 +7405,15 @@ dependencies = [
"memchr",
]

[[package]]
name = "winnow"
version = "0.6.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
dependencies = [
"memchr",
]

[[package]]
name = "winreg"
version = "0.50.0"
@@ -7377,13 +7456,12 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"deranged",
"either",
"fail",
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.14.5",
@@ -7401,7 +7479,9 @@ dependencies = [
"num-traits",
"once_cell",
"parquet",
"proc-macro2",
"prost",
"quote",
"rand 0.8.5",
"regex",
"regex-automata 0.4.3",
@@ -7418,13 +7498,14 @@ dependencies = [
"syn 1.0.109",
"syn 2.0.52",
"sync_wrapper",
"tikv-jemalloc-sys",
"time",
"time-macros",
"tokio",
"tokio-rustls 0.24.0",
"tokio-util",
"toml_datetime",
"toml_edit",
"toml_edit 0.19.10",
"tonic",
"tower",
"tracing",
@@ -15,7 +15,6 @@ members = [
"storage_controller",
"storage_scrubber",
"workspace_hack",
"trace",
"libs/compute_api",
"libs/pageserver_api",
"libs/postgres_ffi",
@@ -84,7 +83,6 @@ enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
@@ -111,8 +109,8 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.8"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
@@ -186,6 +184,7 @@ tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
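The proxy crate picks up the new `typed-json` dependency above. A hedged sketch of the intended benefit, assuming its `json!` macro mirrors `serde_json::json!` (the crate's stated purpose) while building a statically typed `impl Serialize` instead of an allocated `serde_json::Value`:

```rust
// Hypothetical usage sketch; the function and field names are illustrative.
fn error_body(code: u32, message: &str) -> serde_json::Result<String> {
    let body = typed_json::json!({
        "code": code,
        "message": message,
    });
    // Serializes like any serde type, with no intermediate Value tree.
    serde_json::to_string(&body)
}
```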
@@ -42,12 +42,13 @@ ARG CACHEPOT_BUCKET=neon-github-dev
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --chown=nonroot . .

# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
@@ -56,6 +57,7 @@ RUN set -e \
--bin storage_controller \
--bin proxy \
--bin neon_local \
--bin storage_scrubber \
--locked --release \
&& cachepot -s

@@ -82,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin

COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
@@ -1,5 +1,13 @@
FROM debian:bullseye-slim

# Use ARG as a build-time environment variable here to allow.
# It's not supposed to be set outside.
# Alternatively it can be obtained using the following command
# ```
# . /etc/os-release && echo "${VERSION_CODENAME}"
# ```
ARG DEBIAN_VERSION_CODENAME=bullseye

# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
@@ -26,7 +34,6 @@ RUN set -e \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libpq-dev \
libreadline-dev \
libseccomp-dev \
libsqlite3-dev \
@@ -67,19 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
# LLVM
ENV LLVM_VERSION=18
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# PostgreSQL 14
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
# Install docker
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
&& apt update \
&& apt install -y postgresql-client-14 \
&& apt install -y docker-ce docker-ce-cli \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Configure sudo & docker
RUN usermod -aG sudo nonroot && \
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
usermod -aG docker nonroot

# AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
&& unzip -q awscliv2.zip \
@@ -113,10 +125,10 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
&& rm -rf ../lcov.tar.gz

# Compile and install the static OpenSSL library
ENV OPENSSL_VERSION=3.2.2
ENV OPENSSL_VERSION=1.1.1w
ENV OPENSSL_PREFIX=/usr/local/openssl
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
cd /tmp && \
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
@@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
FROM build-deps AS rum-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

COPY patches/rum.patch /rum.patch

RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control

@@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.13"
bytes = "1.0"
rust-ini = "0.20.0"
rlimit = "0.10.1"
@@ -6,7 +6,7 @@
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - If remote_extension_config is provided, it will be used to fetch extensions list
//! and download `shared_preload_libraries` from the remote storage.
//! and download `shared_preload_libraries` from the remote storage.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -33,7 +33,6 @@
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
//!
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
@@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{setrlimit, Resource};

// this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var
@@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;

// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;

let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
@@ -56,6 +56,7 @@ pub struct ComputeNode {
/// - we push new spec and it does reconfiguration
/// - but then something happens and compute pod / VM is destroyed,
/// so k8s controller starts it again with the **old** spec
///
/// and the same for empty computes:
/// - we started compute without any spec
/// - we push spec and it does configuration
@@ -798,7 +799,11 @@ impl ComputeNode {
// In this case we need to connect with old `zenith_admin` name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let connstr = self.connstr.clone();
let mut connstr = self.connstr.clone();
connstr
.query_pairs_mut()
.append_pair("application_name", "apply_config");

let mut client = match Client::connect(connstr.as_str(), NoTls) {
Err(e) => match e.code() {
Some(&SqlState::INVALID_PASSWORD)
@@ -867,15 +872,19 @@ impl ComputeNode {

// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut connstr = connstr.clone();
connstr
.query_pairs_mut()
.append_pair("application_name", "migrations");

let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations")
});
Ok(())
}

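These hunks, and the matching one in the activity monitor further down, repeat a single pattern: tag each internal connection with an `application_name` so it can be told apart in `pg_stat_activity`. A minimal sketch of that pattern, assuming `connstr` is a `url::Url` as elsewhere in `compute_ctl` (the helper name is hypothetical):

```rust
use url::Url;

// Clone the base connection URL and append application_name as a query
// parameter, e.g. postgres://host/db?application_name=apply_config.
fn with_application_name(base: &Url, app_name: &str) -> Url {
    let mut connstr = base.clone();
    connstr
        .query_pairs_mut()
        .append_pair("application_name", app_name);
    connstr
}
```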
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access.
// Wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop.
#[instrument(skip_all)]
fn pg_reload_conf(&self) -> Result<()> {
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
@@ -1108,7 +1117,7 @@ impl ComputeNode {
// EKS worker nodes have following core dump settings:
// /proc/sys/kernel/core_pattern -> core
// /proc/sys/kernel/core_uses_pid -> 1
// ulimint -c -> unlimited
// ulimit -c -> unlimited
// which results in core dumps being written to postgres data directory as core.<pid>.
//
// Use that as a default location and pattern, except macos where core dumps are written
@@ -1387,7 +1396,9 @@ pub fn forward_termination_signal() {
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
kill(pg_pid, Signal::SIGQUIT).ok();
// Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
// ROs to get a list of running xacts faster instead of going through the CLOG.
// See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
kill(pg_pid, Signal::SIGINT).ok();
}
}
@@ -83,12 +83,6 @@ pub fn write_postgres_conf(
ComputeMode::Replica => {
// hot_standby is 'on' by default, but let's be explicit
writeln!(file, "hot_standby=on")?;

// Inform the replica about the primary state
// Default is 'false'
if let Some(primary_is_running) = spec.primary_is_running {
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
}
}
}
@@ -11,6 +11,7 @@ pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
mod migration;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
105  compute_tools/src/migration.rs  Normal file
@@ -0,0 +1,105 @@
use anyhow::{Context, Result};
use postgres::Client;
use tracing::info;

pub(crate) struct MigrationRunner<'m> {
client: &'m mut Client,
migrations: &'m [&'m str],
}

impl<'m> MigrationRunner<'m> {
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
// The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
assert!(migrations.len() + 1 < i64::MAX as usize);

Self { client, migrations }
}

fn get_migration_id(&mut self) -> Result<i64> {
let query = "SELECT id FROM neon_migration.migration_id";
let row = self
.client
.query_one(query, &[])
.context("run_migrations get migration_id")?;

Ok(row.get::<&str, i64>("id"))
}

fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);

self.client
.simple_query(&setval)
.context("run_migrations update id")?;

Ok(())
}

fn prepare_migrations(&mut self) -> Result<()> {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
self.client.simple_query(query)?;

let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
self.client.simple_query(query)?;

let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
self.client.simple_query(query)?;

let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
self.client.simple_query(query)?;

let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
self.client.simple_query(query)?;

Ok(())
}

pub fn run_migrations(mut self) -> Result<()> {
self.prepare_migrations()?;

let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}

let migration = self.migrations[current_migration];

if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id!(current_migration));
} else {
info!(
"Running migration id={}:\n{}\n",
migration_id!(current_migration),
migration
);

self.client
.simple_query("BEGIN")
.context("begin migration")?;

self.client.simple_query(migration).with_context(|| {
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;

// Migration IDs start at 1
self.update_migration_id(migration_id!(current_migration))?;

self.client
.simple_query("COMMIT")
.context("commit migration")?;

info!("Finished migration id={}", migration_id!(current_migration));
}

current_migration += 1;
}

Ok(())
}
}
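A hedged sketch of how the new runner is driven (the real call site is the `handle_migrations` hunk further down; the function name and SQL literals here are illustrative only):

```rust
use anyhow::Result;
use postgres::{Client, NoTls};

fn apply_migrations(connstr: &str) -> Result<()> {
    // Ordered list: the id persisted in neon_migration.migration_id is
    // index + 1, so an already-applied prefix is skipped on restart.
    let migrations = [
        "-- SKIP: example of a retired migration",
        "GRANT pg_monitor TO neon_superuser", // illustrative SQL
    ];
    let mut client = Client::connect(connstr, NoTls)?;
    MigrationRunner::new(&mut client, &migrations).run_migrations()
}
```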
@@ -0,0 +1,7 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
END IF;
END $$;
@@ -17,7 +17,11 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
// should be handled gracefully.
fn watch_compute_activity(compute: &ComputeNode) {
// Suppose that `connstr` doesn't change
let connstr = compute.connstr.as_str();
let mut connstr = compute.connstr.clone();
connstr
.query_pairs_mut()
.append_pair("application_name", "compute_activity_monitor");
let connstr = connstr.as_str();

// During startup and configuration we connect to every Postgres database,
// but we don't want to count this as some user activity. So wait until
|
||||
/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
|
||||
/// - next line starts with timestamp
|
||||
/// - EOF
|
||||
/// - no new lines were written for the last second
|
||||
/// - no new lines were written for the last 100 milliseconds
|
||||
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
|
||||
let mut lines = tokio::io::BufReader::new(stderr).lines();
|
||||
let timeout_duration = Duration::from_millis(100);
|
||||
|
||||
@@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

use crate::config;
use crate::logger::inlinify;
use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
@@ -776,84 +777,25 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {

// Add new migrations in numerical order.
let migrations = [
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0001-alter_roles.sql"),
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0002-alter_roles.sql"),
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!(
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
),
include_str!(
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
),
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
include_str!(
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
),
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
];

let mut func = || {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;

let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;

let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;

let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;

let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;

let query = "SELECT id FROM neon_migration.migration_id";
let row = client
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;

let query = "BEGIN";
client
.simple_query(query)
.context("handle_migrations begin")?;

while current_migration < migrations.len() {
let migration = &migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
);
client.simple_query(migration).with_context(|| {
format!("handle_migrations current_migration={}", current_migration)
})?;
}
current_migration += 1;
}
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client
.simple_query(&setval)
.context("handle_migrations update id")?;

let query = "COMMIT";
client
.simple_query(query)
.context("handle_migrations commit")?;

info!(
"Ran {} migrations",
(migrations.len() - starting_migration_id)
);
MigrationRunner::new(client, &migrations).run_migrations()?;

Ok(())
}
@@ -21,10 +21,8 @@ use pageserver_api::config::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::controller_api::{PlacementPolicy, TenantCreateRequest};
use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
@@ -600,13 +598,9 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
Some(("import", import_match)) => {
let tenant_id = get_tenant_id(import_match, env)?;
let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided");
let name = import_match
.get_one::<String>("node-name")
.ok_or_else(|| anyhow!("No node name provided"))?;
let update_catalog = import_match
.get_one::<bool>("update-catalog")
.cloned()
.unwrap_or_default();
let branch_name = import_match
.get_one::<String>("branch-name")
.ok_or_else(|| anyhow!("No branch name provided"))?;

// Parse base inputs
let base_tarfile = import_match
@@ -633,24 +627,11 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.copied()
.context("Failed to parse postgres version from the argument string")?;

let mut cplane = ComputeControlPlane::load(env.clone())?;
println!("Importing timeline into pageserver ...");
pageserver
.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)
.await?;
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;

println!("Creating endpoint for imported timeline ...");
cplane.new_endpoint(
name,
tenant_id,
timeline_id,
None,
None,
pg_version,
ComputeMode::Primary,
!update_catalog,
)?;
env.register_branch_mapping(branch_name.to_string(), tenant_id, timeline_id)?;
println!("Done");
}
Some(("branch", branch_match)) => {
@@ -865,20 +846,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

let allow_multiple = sub_args.get_flag("allow-multiple");

// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
let mut safekeepers: Vec<NodeId> = Vec::new();
for sk_id in safekeepers_str.split(',').map(str::trim) {
let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
})?);
safekeepers.push(sk_id);
}
safekeepers
} else {
env.safekeepers.iter().map(|sk| sk.id).collect()
};
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? {
safekeepers
} else {
env.safekeepers.iter().map(|sk| sk.id).collect()
};

let endpoint = cplane
.endpoints
@@ -982,7 +956,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
})
.collect::<Vec<_>>()
};
endpoint.reconfigure(pageservers, None).await?;
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = parse_safekeepers(sub_args)?;
endpoint.reconfigure(pageservers, None, safekeepers).await?;
}
"stop" => {
let endpoint_id = sub_args
@@ -1004,6 +981,23 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
Ok(())
}

/// Parse --safekeepers as list of safekeeper ids.
fn parse_safekeepers(sub_args: &ArgMatches) -> Result<Option<Vec<NodeId>>> {
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
let mut safekeepers: Vec<NodeId> = Vec::new();
for sk_id in safekeepers_str.split(',').map(str::trim) {
let sk_id = NodeId(
u64::from_str(sk_id)
.map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?,
);
safekeepers.push(sk_id);
}
Ok(Some(safekeepers))
} else {
Ok(None)
}
}

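Usage note: the `Option` return is what lets the two call sites above behave differently; a sketch under the same names as the surrounding diff:

```rust
// `endpoint start`: no --safekeepers means "all safekeepers from the env".
let safekeepers: Vec<NodeId> = parse_safekeepers(sub_args)?
    .unwrap_or_else(|| env.safekeepers.iter().map(|sk| sk.id).collect());

// `endpoint reconfigure`: no --safekeepers means "leave them unchanged",
// so the Option is passed through to reconfigure() as-is.
endpoint.reconfigure(pageservers, None, parse_safekeepers(sub_args)?).await?;
```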
fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(ep_subcommand_data) => ep_subcommand_data,
@@ -1487,8 +1481,7 @@ fn cli() -> Command {
.about("Import timeline from basebackup directory")
.arg(tenant_id_arg.clone())
.arg(timeline_id_arg.clone())
.arg(Arg::new("node-name").long("node-name")
.help("Name to assign to the imported timeline"))
.arg(branch_name_arg.clone())
.arg(Arg::new("base-tarfile")
.long("base-tarfile")
.value_parser(value_parser!(PathBuf))
@@ -1504,7 +1497,6 @@ fn cli() -> Command {
.arg(Arg::new("end-lsn").long("end-lsn")
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
.arg(update_catalog.clone())
)
).subcommand(
Command::new("tenant")
@@ -1609,7 +1601,7 @@ fn cli() -> Command {
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
.arg(endpoint_id_arg.clone())
.arg(endpoint_pageserver_id_arg.clone())
.arg(safekeepers_arg)
.arg(safekeepers_arg.clone())
.arg(remote_ext_config_args)
.arg(create_test_user)
.arg(allow_multiple.clone())
@@ -1618,6 +1610,7 @@ fn cli() -> Command {
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")
.arg(endpoint_pageserver_id_arg)
.arg(safekeepers_arg)
.arg(endpoint_id_arg.clone())
.arg(tenant_id_arg.clone())
)
@@ -499,6 +499,23 @@ impl Endpoint {
.join(",")
}

/// Map safekeepers ids to the actual connection strings.
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in sk_ids {
let sk = self
.env
.safekeepers
.iter()
.find(|node| node.id == sk_id)
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
}
}
Ok(safekeeper_connstrings)
}

pub async fn start(
&self,
auth_token: &Option<String>,
@@ -523,18 +540,7 @@ impl Endpoint {
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstring.is_empty());

let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
let sk = self
.env
.safekeepers
.iter()
.find(|node| node.id == sk_id)
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
}
}
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;

// check for file remote_extensions_spec.json
// if it is present, read it and pass to compute_ctl
@@ -592,7 +598,6 @@ impl Endpoint {
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
primary_is_running: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -741,6 +746,7 @@ impl Endpoint {
&self,
mut pageservers: Vec<(Host, u16)>,
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
@@ -775,6 +781,12 @@ impl Endpoint {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}

// If safekeepers are not specified, don't change them.
if let Some(safekeepers) = safekeepers {
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
spec.safekeeper_connstrings = safekeeper_connstrings;
}

let client = reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.build()
@@ -325,11 +325,16 @@ impl LocalEnv {
}
}

pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
}

pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
self.pg_dir(pg_version, "bin")
}

pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
self.pg_dir(pg_version, "lib")
}

pub fn pageserver_bin(&self) -> PathBuf {
@@ -15,10 +15,8 @@ use std::time::Duration;

use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
TimelineInfo,
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
@@ -351,11 +349,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
@@ -397,28 +390,6 @@ impl PageServerNode {
}
}

pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let config = Self::parse_config(settings.clone())?;

let request = models::TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
generation,
config,
shard_parameters: ShardParameters::default(),
// Placement policy is not meaningful for creations not done via storage controller
placement_policy: None,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
Ok(self.http_client.tenant_create(&request).await?)
}

pub async fn tenant_config(
&self,
tenant_id: TenantId,
@@ -478,11 +449,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
@@ -589,60 +555,39 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let (client, conn) = self.page_server_psql_client().await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
let client = std::pin::pin!(client);

// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
let base_tarfile =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));

// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
let wal_reader =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};

let copy_in = |reader, cmd| {
let client = &client;
async move {
let writer = client.copy_in(&cmd).await?;
let writer = std::pin::pin!(writer);
let mut writer = writer.sink_map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
});
let mut reader = std::pin::pin!(reader);
writer.send_all(&mut reader).await?;
writer.into_inner().finish().await?;
anyhow::Ok(())
}
};

// Import base
copy_in(
base_tarfile,
format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
),
)
.await?;
// Import wal if necessary
if let Some(wal_reader) = wal_reader {
copy_in(
wal_reader,
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
self.http_client
.import_basebackup(
tenant_id,
timeline_id,
start_lsn,
end_lsn,
pg_version,
base_tarfile,
)
.await?;

// Import wal if necessary
if let Some(wal_reader) = wal_reader {
self.http_client
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
.await?;
}

Ok(())
@@ -5,12 +5,11 @@ use crate::{
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
TenantShardMigrateRequest, TenantShardMigrateResponse,
NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse,
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::{
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -156,16 +155,16 @@ impl StorageController {
.expect("non-Unicode path")
}

/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
/// Find the directory containing postgres subdirectories, such `bin` and `lib`
///
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
/// to other versions if that one isn't found. Some automated tests create circumstances
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];

for v in prefer_versions {
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
if tokio::fs::try_exists(&path).await? {
return Ok(path);
}
@@ -173,11 +172,20 @@ impl StorageController {

// Fall through
anyhow::bail!(
"Postgres binaries not found in {}",
self.env.pg_distrib_dir.display()
"Postgres directory '{}' not found in {}",
dir_name,
self.env.pg_distrib_dir.display(),
);
}

pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
self.get_pg_dir("bin").await
}

pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
self.get_pg_dir("lib").await
}

/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
@@ -230,12 +238,17 @@ impl StorageController {
.unwrap()
.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_lib_dir = self.get_pg_lib_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");

if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.envs(vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
])
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
@@ -270,7 +283,10 @@ impl StorageController {
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
[],
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
retry_timeout,
|| self.pg_isready(&pg_bin_dir),
@@ -325,7 +341,10 @@ impl StorageController {
&self.env.base_data_dir,
&self.env.storage_controller_bin(),
args,
[],
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.pid_file()),
retry_timeout,
|| async {
@@ -4,13 +4,13 @@ use std::{str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -56,6 +56,10 @@ enum Command {
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
NodeDelete {
#[arg(long)]
node_id: NodeId,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
@@ -336,14 +340,18 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
Command::TenantCreate { tenant_id } => {
vps_client
.tenant_create(&TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters::default(),
placement_policy: Some(PlacementPolicy::Attached(1)),
config: TenantConfig::default(),
})
storcon_client
.dispatch::<_, ()>(
Method::POST,
"v1/tenant".to_string(),
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters::default(),
placement_policy: Some(PlacementPolicy::Attached(1)),
config: TenantConfig::default(),
}),
)
.await?;
}
Command::TenantDelete { tenant_id } => {
@@ -353,13 +361,16 @@ async fn main() -> anyhow::Result<()> {
tracing::info!("Delete status: {}", status);
}
Command::Nodes {} => {
let resp = storcon_client
let mut resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;

resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));

let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
for node in resp {
@@ -391,13 +402,16 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
Command::Tenants {} => {
let resp = storcon_client
let mut resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;

resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));

let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
@@ -646,6 +660,11 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
.await?;
}
Command::NodeDelete { node_id } => {
storcon_client
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
.await?;
}
Command::TenantSetTimeBasedEviction {
tenant_id,
period,
10	docker-compose/README.md	Normal file
@@ -0,0 +1,10 @@

# Example docker compose configuration

The configuration in this directory is used for testing Neon docker images: it is
not intended for deploying a usable system. To run a development environment where
you can experiment with a miniature Neon system, use `cargo neon` rather than container images.

This configuration does not start the storage controller, because the controller
needs a way to reconfigure running computes, and no such thing exists in this setup.

@@ -23,11 +23,10 @@ echo "Page server is ready."
echo "Create a tenant and timeline"
generate_id tenant_id
PARAMS=(
-sb
-X POST
-X PUT
-H "Content-Type: application/json"
-d "{\"new_tenant_id\": \"${tenant_id}\"}"
http://pageserver:9898/v1/tenant/
-d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .

345	docs/rfcs/033-storage-controller-drain-and-fill.md	Normal file
@@ -0,0 +1,345 @@

# Graceful Restarts of Storage Controller Managed Clusters

## Summary
This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes.
It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement
graceful cluster restarts.

## Motivation

Pageserver restarts cause read availability downtime for tenants.

For example, pageserver-3 @ us-east-1 was unavailable for a randomly
picked tenant (which requested on-demand activation) for around 30 seconds
during the restart at 2024-04-03 16:37 UTC.

Note that lots of shutdowns on loaded pageservers do not finish within the
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.

This problem is not yet very acutely felt in storage controller managed pageservers since
tenant density is much lower there. However, we are planning on eventually migrating all
pageservers to storage controller management, so it makes sense to solve the issue proactively.

## Requirements

- Pageserver re-deployments cause minimal downtime for tenants
- The storage controller exposes HTTP API hooks for draining and filling tenant shards
from a given pageserver. Said hooks can be used by an orchestrator process or a human operator.
- The storage controller exposes an HTTP API to cancel draining and filling background operations.
- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed
as usual (with downtime).
- Progress of draining/filling is visible through metrics

## Non Goals

- Integration with the control plane
- Graceful restarts for large non-HA tenants.

## Impacted Components

- storage controller
- deployment orchestrator (i.e. Ansible)
- pageserver (indirectly)

## Terminology

**Draining** is the process through which all tenant shards that can be migrated from a given pageserver
are distributed across the rest of the cluster.

**Filling** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given
pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers.

**Node scheduling policies** act as constraints on the scheduler. For instance, when a
node is set in the `Paused` policy, no further shards will be scheduled on it.

**Node** is a pageserver. The terms are used interchangeably in this RFC.

**Deployment orchestrator** is a generic term for whatever drives our deployments.
Currently, it's an Ansible playbook.

## Background

### Storage Controller Basics (skip if already familiar)

Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver node and tenant shard metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.

An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook.

### Background Optimizations

The storage controller performs scheduling optimizations in the background. It will
migrate attachments to warm secondaries and replace secondaries in order to balance
the cluster out.

### Reconciliations Concurrency Limiting

There's a hard limit on the number of reconciles that the storage controller
can have in flight at any given time. To get an idea of scale, the limit is
128 at the time of writing.

## Implementation

Note: this section focuses on the core functionality of the graceful restart process.
It doesn't necessarily describe the most efficient approach. Optimizations are described
separately in a later section.

### Overall Flow

This section describes how to implement graceful restarts from the perspective
of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
The orchestrator shall implement the following prologue and epilogue steps for each
pageserver restart:

#### Prologue

The orchestrator shall first fetch the pageserver node id from the control plane or
the pageserver it aims to restart directly. Next, it issues an HTTP request
to the storage controller in order to start the drain of said pageserver node.
All error responses are retried with a short back-off. When a 202 (Accepted)
HTTP code is returned, the drain has started. Now the orchestrator polls the
node status endpoint exposed by the storage controller in order to await the
end of the drain process. When the `policy` field of the node status response
becomes `PauseForRestart`, the drain has completed and the orchestrator can
proceed with restarting the pageserver.

The prologue is subject to an overall timeout. It will have a value in the ballpark
of minutes. As storage controller managed pageservers become more loaded this timeout
will likely have to increase.
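
The sketch below shows the shape of this prologue as an orchestration client might
implement it. It is a minimal sketch, not the actual Ansible code: the storage
controller base URL, the node status path, and the timeout values are illustrative
assumptions; the drain endpoint and the `policy` field are the ones described in
this RFC.

```rust
use std::time::{Duration, Instant};

// Hypothetical storage controller base URL.
const STORCON: &str = "http://storage-controller:1234";

async fn drain_prologue(node_id: u64) -> anyhow::Result<()> {
    let http = reqwest::Client::new();

    // Start the drain; retry all error responses with a short back-off.
    loop {
        match http
            .put(format!("{STORCON}/v1/control/node/{node_id}/drain"))
            .send()
            .await
        {
            Ok(resp) if resp.status() == reqwest::StatusCode::ACCEPTED => break,
            _ => tokio::time::sleep(Duration::from_secs(2)).await,
        }
    }

    // Poll node status until the scheduling policy becomes PauseForRestart,
    // subject to an overall timeout in the ballpark of minutes.
    let deadline = Instant::now() + Duration::from_secs(10 * 60);
    while Instant::now() < deadline {
        let status: serde_json::Value = http
            .get(format!("{STORCON}/v1/control/node/{node_id}")) // assumed status path
            .send()
            .await?
            .json()
            .await?;
        if status["policy"] == "PauseForRestart" {
            return Ok(()); // safe to restart the pageserver now
        }
        tokio::time::sleep(Duration::from_secs(5)).await;
    }
    // Per the requirements, a failed/timed-out drain is not fatal: restart anyway.
    anyhow::bail!("drain timed out; proceed with restart (accepting downtime)")
}
```

The epilogue is symmetric: trigger the fill, then poll until the policy returns to
`Active`.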
#### Epilogue

After restarting the pageserver, the orchestrator issues an HTTP request
to the storage controller to kick off the filling process. This API call
may be retried for all error codes with a short backoff. This also serves
as a synchronization primitive as the fill will be refused if the pageserver
has not yet re-attached to the storage controller. When a 202 (Accepted) HTTP
code is returned, the fill has started. Now the orchestrator polls the node
status endpoint exposed by the storage controller in order to await the end of
the filling process. When the `policy` field of the node status response becomes
`Active`, the fill has completed and the orchestrator may proceed to the next pageserver.

Again, the epilogue is subject to an overall timeout. We can start off with
using the same timeout as for the prologue, but can also consider relying on
the storage controller's background optimizations with a shorter timeout.

In the case that the deployment orchestrator times out, it attempts to cancel
the fill. This operation shall be retried with a short back-off. If it ultimately
fails, it will require manual intervention to set the node's scheduling policy to
`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic,
but it constrains the scheduler as mentioned previously.

### Node Scheduling Policy State Machine

The state machine below encodes the behaviours discussed above and
the various failover situations described in a later section.

Assuming no failures and/or timeouts the flow should be:
`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active`

```
Operator requested drain
+-----------------------------------------+
| |
+-------+-------+ +-------v-------+
| | | |
| Pause | +-----------> Draining +----------+
| | | | | |
+---------------+ | +-------+-------+ |
| | |
| | |
Drain requested| | |
| |Drain complete | Drain failed
| | | Cancelled/PS reattach/Storcon restart
| | |
+-------+-------+ | |
| | | |
+-------------+ Active <-----------+------------------+
| | | |
Fill requested | +---^---^-------+ |
| | | |
| | | |
| | | |
| Fill completed| | |
| | |PS reattach |
| |after restart |
+-------v-------+ | | +-------v-------+
| | | | | |
| Filling +---------+ +-----------+PauseForRestart|
| | | |
+---------------+ +---------------+
```
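
As a reference for the states above, a sketch of the scheduling policy enum (the
real `NodeSchedulingPolicy` lives in the storage controller; this only shows its shape):

```rust
/// Sketch of the node scheduling policy states used in the diagram above.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NodeSchedulingPolicy {
    /// Neutral state: the scheduler may freely use the node.
    Active,
    /// Operator hold: no further shards are scheduled onto the node.
    Pause,
    /// A drain task is running: shards are being migrated away.
    Draining,
    /// Drain finished: the orchestrator may restart the pageserver.
    PauseForRestart,
    /// A fill task is running: shards are being migrated back.
    Filling,
}
```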
### Draining/Filling APIs

The storage controller API to trigger the draining of a given node is:
`PUT /v1/control/node/:node_id/{drain,fill}`.

The following HTTP non-success return codes are used.
All of them are safely retriable from the perspective of the storage controller.
- 404: Requested node was not found
- 503: Requested node is known to the storage controller, but unavailable
- 412: Drain precondition failed: there is no other node to drain to or the node's scheduling policy forbids draining
- 409: A {drain, fill} is already in progress. Only one such background operation
is allowed per node.

When the drain is accepted and commenced, a 202 HTTP code is returned.

Drains and fills shall be cancellable by the deployment orchestrator or a
human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200
response is returned when the cancellation is successful. Errors are retriable.

### Drain Process

Before accepting a drain request, the following validations are applied:
* Ensure that the node is known to the storage controller
* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
* Ensure that another drain or fill is not already running on the node
* Ensure that a drain is possible (i.e. check that there is at least one
schedulable node to drain to)

After accepting the drain, the scheduling policy of the node is set to
`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
This disallows the optimizer from adding or removing shards from the node, which
is desirable to avoid them racing.

Next, a separate Tokio task is spawned to manage the draining. For each tenant
shard attached to the node being drained, demote the node to a secondary and
attempt to schedule the shard away. Scheduling might fail due to unsatisfiable
constraints, but that is fine. Draining is a best effort process since it might
not always be possible to cut over all shards. A sketch of this task follows below.

Importantly, this task manages the concurrency of issued reconciles in order to
avoid drowning out the target pageservers and to allow other important reconciles
to proceed.

Once the triggered reconciles have finished or timed out, set the node's scheduling
policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.

A note on non-HA tenants: these tenants do not have secondaries, so by the description
above, they would not be migrated. It makes sense to skip them (especially the large ones)
since, depending on tenant size, this might be more disruptive than the restart: the
pageserver we've moved to will need to on-demand download the entire working set for the tenant.
We can consider expanding to small non-HA tenants in the future.
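
Below is a minimal sketch of the bounded-concurrency pattern the drain task uses,
with a stub `migrate_shard` standing in for the real demote-and-reschedule
reconcile; the limit of 16 is an illustrative value, chosen to stay well below the
global in-flight reconcile limit of 128:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

// Stand-in for: demote the draining node to a secondary for this shard,
// schedule the attachment elsewhere, and wait for the reconcile to finish.
async fn migrate_shard(shard: u32) -> anyhow::Result<()> {
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let attached_shards: Vec<u32> = (0..100).collect();
    // Cap concurrent reconciles so the drain cannot drown out other,
    // more important reconciliations.
    let limiter = Arc::new(Semaphore::new(16));
    let mut tasks = tokio::task::JoinSet::new();
    for shard in attached_shards {
        let permit = limiter.clone().acquire_owned().await?;
        tasks.spawn(async move {
            let _permit = permit; // released when this reconcile completes
            if let Err(e) = migrate_shard(shard).await {
                // Best effort: failing to move one shard does not fail the drain.
                eprintln!("shard {shard}: {e}");
            }
        });
    }
    while tasks.join_next().await.is_some() {}
    // At this point the real task would set NodeSchedulingPolicy::PauseForRestart.
    Ok(())
}
```

The fill task reuses the same pattern, promoting secondaries on the filled node
instead of demoting attachments away from it.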
### Fill Process

Before accepting a fill request, the following validations are applied:
* Ensure that the node is known to the storage controller
* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`.
This is the only acceptable policy for the fill starting state. When a node re-attaches,
it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
* Ensure that another drain or fill is not already running on the node

After accepting the fill, the scheduling policy of the node is set to
`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
This disallows the optimizer from adding or removing shards from the node, which
is desirable to avoid them racing.

Next, a separate Tokio task is spawned to manage the filling. For each tenant
shard where the filled node is a secondary, promote the secondary. This is done
until we run out of shards or the counts of attached shards become balanced across
the cluster.

Like for draining, the concurrency of spawned reconciles is limited.

### Failure Modes & Handling

Failures are generally handled by transitioning back into the `Active`
(neutral) state. This simplifies the implementation greatly at the
cost of adding transitions to the state machine. For example, we
could detect the `Draining` state upon restart and proceed with a drain,
but how would the storage controller know whether that is still what the
orchestrator needs?

#### Storage Controller Crash

When the storage controller starts up, reset the node scheduling policy
of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
`Active`. The rationale is that when the storage controller restarts,
we have lost context of what the deployment orchestrator wants. It also
has the benefit of making things easier to reason about.

#### Pageserver Crash During Drain

The pageserver will attempt to re-attach during restart, at which
point the node scheduling policy will be set back to `Active`, thus
re-enabling the scheduler to use the node.

#### Non-drained Pageserver Crash During Drain

What should happen when a pageserver we are draining to crashes during the
process? Two reasonable options are: cancel the drain and focus on the failover,
*or* do both, but prioritise failover. Since the number of concurrent reconciles
produced by drains/fills is limited, we get the latter behaviour for free.
My suggestion is we take this approach, but the cancellation option is trivial
to implement as well.

#### Pageserver Crash During Fill

The pageserver will attempt to re-attach during restart, at which
point the node scheduling policy will be set back to `Active`, thus
re-enabling the scheduler to use the node.

#### Pageserver Goes Unavailable During Drain/Fill

The drain and fill jobs handle this by stopping early. When the pageserver
is detected as online by storage controller heartbeats, reset its scheduling
policy to `Active`. If a restart happens instead, see the pageserver crash
failure mode.

#### Orchestrator Drain Times Out

The orchestrator will still proceed with the restart.
When the pageserver re-attaches, the scheduling policy is set back to
`Active`.

#### Orchestrator Fill Times Out

The orchestrator will attempt to cancel the fill operation. If that fails,
the fill will continue until it quiesces and the node will be left
in the `Filling` scheduling policy. This hinders the scheduler, but is
otherwise harmless. A human operator can handle this by setting the scheduling
policy to `Active`, or we can bake a fill timeout into the storage controller.

## Optimizations

### Location Warmth

When cutting over to a secondary, the storage controller will wait for it to
become "warm" (i.e. download enough of the tenant's data). This means that some
reconciliations can take significantly longer than others and hold up precious
reconciliation units. As an optimization, the drain stage can cut over only
tenants that are already "warm". Similarly, the fill stage can prioritise the
"warmest" tenants in the fill.

Given that the number of tenants managed by the storage controller will be fairly low
for the foreseeable future, the first implementation could simply query the tenants
for secondary status. This doesn't scale well with increasing tenant counts, so
eventually we will need new pageserver API endpoints to report the sets of
"warm" and "cold" nodes.

## Alternatives Considered

### Draining and Filling Purely as Scheduling Constraints

At its core, the storage controller is a big background loop that detects changes
in the environment and reacts to them. One could express draining and filling
of nodes purely in terms of constraining the scheduler (as opposed to having
such background tasks).

While theoretically nice, I think that's harder to implement and, more importantly, to operate and reason about.
Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create
an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish
to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong
to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion.

It would also mean that reconciliations themselves have side effects that persist in the database
(persist something to the database when the drain is done), which I'm not conceptually fond of.

## Proof of Concept

This RFC is accompanied by a POC which implements nearly everything mentioned here
apart from the optimizations and some of the failure handling:
https://github.com/neondatabase/neon/pull/7682
252	docs/rfcs/034-ancestor-deletion.md	Normal file
@@ -0,0 +1,252 @@

# Ancestor Timeline Deletion

Created on: 2024-02-23

Author: John Spray

# Summary

When a tenant creates a new timeline that they will treat as their 'main' history,
it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
this is necessary because it is forbidden to delete a timeline which has descendants.

A new pageserver API is proposed to 'adopt' data from a parent timeline into
one of its children, such that the link between ancestor and child can be severed,
leaving the parent in a state where it may then be deleted.

# Motivation

Retaining parent timelines currently has two costs:

- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
child's timeline point, even if the child fully covers its keyspace with image
layers and will never actually read from the parent.

# Solution

A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
wish to detach from its parent.

On success, this API will leave the following state:

- The detached child timeline will no longer have an ancestor, and will contain all
the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
the detached child timeline will be modified to have the child timeline as their
new parent.
- The parent timeline will still exist, but the child will no longer have it as an
ancestor. If this was the last timeline that depended on the parent, then the
parent will become deletable.

This API's implementation will consist of a series of retryable steps, such that
on failures/timeout it can safely be called again to reach the target state.

## Example

### Before

The user has "rolled back" their project to LSN X, resulting in a "new main"
timeline. The parent "old main" timeline still exists, and they would like
to clean it up.

They have two other timelines A and B. A is from before the rollback point,
and B is from after the rollback point.

```
----"old main" timeline-------X-------------------------------------------->
| | |
|-> child A | |
|-> "new main" timeline |
-> child B

```

### After calling detach ancestor API

The "new main" timeline is no longer dependent on old main, and neither
is child A, because it had a branch point before X.

The user may now choose to delete child B and "old main" to get to
a pristine state. Child B is likely to be unwanted since the user
chose to roll back to X, and it branches from after X. However, we
don't assume this in the API; it is up to the user to delete it.

```
|----"old main" timeline---------------------------------------------------->
|
|
|
-> child B

|----"new main" timeline--------->

|
|-> child A

```

### After removing timelines

We end up with a totally clean state that leaves no trace that a rollback
ever happened: there is only one root timeline.

```
| ----"new main" timeline----------->
|
|-> child A


```

## Caveats

Important things for API users to bear in mind:

- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
child, the parent won't be deletable: you must either delete or detach those
children.
- do _not_ simply loop over all children and detach them all: this can have an
extremely high storage cost. The detach ancestor API is intended for use on a single
timeline to make it the new "main".
- The detach ancestor API should also not be
exposed directly to the user as a button/API, because they might decide
to click it for all the children and thereby generate many copies of the
parent's data -- the detach ancestor API should be used as part
of a high level "clean up after rollback" feature.

## `detach_ancestor` API implementation

Terms used in the following sections:

- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point": the ancestor_lsn of "the child"

### Phase 1: write out adopted layers to S3

The child will "adopt" layers from the parent, such that its end state contains
all the parent's history as well as its own.

For all layers in the parent's layer map whose high LSN is below the branch
point, issue S3 CopyObject requests to duplicate them into the child timeline's
prefix. Do not add them to the child's layer map yet.

For delta layers in the parent's layer map which straddle the branch point, read them
and write out only content up to the branch point into new layer objects.

This is a long running operation if the parent has many layers: it should be
implemented in a way that resumes rather than restarting from scratch, if the API
times out and is called again.

As an optimization, if there are no other timelines that will be adopted into
the child, _and_ the child's image layers already fully cover the branch LSN,
then we may skip adopting layers. A sketch of the per-layer decision follows below.
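
A sketch of that per-layer decision, over a simplified layer record (the real
pageserver layer-map types differ; `branch_lsn` is the branch point):

```rust
/// Simplified stand-in for a layer-map entry.
struct Layer {
    lsn_start: u64,
    lsn_end: u64, // exclusive high LSN of the layer
    is_delta: bool,
}

enum AdoptAction {
    /// Wholly below the branch point: CopyObject into the child's prefix.
    Copy,
    /// Delta straddling the branch point: rewrite content up to it.
    Truncate,
    /// Wholly above the branch point: not needed by the child.
    Skip,
}

fn classify(layer: &Layer, branch_lsn: u64) -> AdoptAction {
    if layer.lsn_end <= branch_lsn {
        AdoptAction::Copy
    } else if layer.is_delta && layer.lsn_start < branch_lsn {
        AdoptAction::Truncate
    } else {
        AdoptAction::Skip
    }
}
```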
### Phase 2: update the child's index

Having written out all needed layers in phase 1, atomically link them all
into the child's IndexPart and upload to S3. This may be done while the
child Timeline is still running.

### Phase 3: modify timelines' ancestry

Set the child's ancestor to None, and upload its IndexPart to persist the change.

For all timelines which have the same parent as the child, and have a branch
point lower than our branch point, switch their ancestor_timeline to the child,
and upload their IndexPart to persist the change. A sketch of this reparenting
rule follows below.
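
The reparenting rule, sketched over a toy timeline record (`TimelineId` and the
metadata shape are illustrative, not the pageserver's actual types):

```rust
type TimelineId = u128;

struct TimelineMeta {
    /// (parent timeline, branch-point LSN), or None for a root timeline.
    ancestor: Option<(TimelineId, u64)>,
}

fn detach_and_reparent(
    timelines: &mut [(TimelineId, TimelineMeta)],
    parent: TimelineId,
    child: TimelineId,
    branch_lsn: u64,
) {
    for (id, meta) in timelines.iter_mut() {
        if *id == child {
            // The detached child loses its ancestor entirely.
            meta.ancestor = None;
        } else if let Some((p, lsn)) = meta.ancestor {
            // Siblings branched from the parent below our branch point are
            // re-parented onto the child, keeping their branch-point LSN.
            if p == parent && lsn < branch_lsn {
                meta.ancestor = Some((child, lsn));
            }
        }
        // Each modified timeline uploads its IndexPart to persist the change.
    }
}
```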
## Alternatives considered

### Generate full image layer on child, rather than adopting parent deltas

This would work for the case of a single child, but would prevent re-targeting
other timelines that depended on the parent. If we detached many children this
way, the storage cost would become prohibitive (consider a 1TB database with
100 child timelines: it would cost 100TiB if they all generated their own image layers).

### Don't rewrite anything: just fake it in the API

We could add a layer of indirection that let a child "pretend" that it had no
ancestor, when in reality it still had the parent. The pageserver API could
accept deletion of ancestor timelines, and just update child metadata to make
them look like they have no ancestor.

This would not achieve the desired reduction in storage cost, and may well be more
complex to maintain than simply implementing the API described in this RFC.

### Avoid copying objects: enable child index to use parent layers directly

We could teach IndexPart to store a TimelineId for each layer, such that a child
timeline could reference a parent's layers directly, rather than copying them
into the child's prefix.

This would impose a cost for the normal case of indices that only target the
timeline's own layers, add complexity, and break the useful simplifying
invariant that timelines "own" their own path. If child timelines were
referencing layers from the parent, we would have to ensure that the parent
never runs GC/compaction again, which would make the API less flexible (the
proposal in this RFC enables deletion of the parent but doesn't require it.)

## Performance

### Adopting layers

- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
of such requests: this can take up to tens of seconds and will compete for RemoteStorage
semaphore units with other activity on the pageserver.
- If we are running on a storage backend that doesn't implement CopyObject, then
this part will be much more expensive as we would stream all layer content
through the pageserver. This is no different to issuing a lot
of reads to a timeline that does not have a warm local cache: it will move
a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for deltas that straddle the branch point will
require streaming read/write of all the layers in question.

### Updating timeline ancestry

The simplest way to update timeline ancestry will probably be to stop and start
all the Timeline objects: this is preferable to the complexity of making their
ancestry mutable at runtime.

There will be a corresponding "stutter" in the availability of the timelines,
of the order of 10-100ms, which is the time taken to upload their IndexPart and
restart the Timeline.

# Interaction with other features

## Concurrent timeline creation

If new historic timelines are created using the parent as an ancestor while the
detach ancestor API is running, they will not be re-parented to the child. This
doesn't break anything, but it leaves the parent in a state where it might not
be possible to delete it.

Since timeline creations are an explicit user action, this is not something we need to
worry about as the storage layer: a user who wants to delete their parent timeline will not create
new children, and if they do, they can choose to delete those children to
enable deleting the parent.

For the least surprise to the user, before starting the detach ancestor
operation, the control plane should wait until all branches are created, and it should not
allow any branches to be created before the branch point on the ancestor branch
while the operation is ongoing.

## WAL based disaster recovery

WAL based disaster recovery currently supports only restoring the main
branch. Enabling WAL based disaster recovery in the future requires that we
keep a record of which timeline generated the WAL and at which LSN a parent was
detached. Keep a list of timeline ids and the LSNs at which they were detached in
the `index_part.json`. Limit the size of the list to the first 100 entries, after
which WAL disaster recovery will not be possible.

## Sharded tenants

For sharded tenants, calls to the detach ancestor API will pass through the storage
controller, which will handle them the same as timeline creations: invoke first
on shard zero, and then on all the other shards.
507	docs/rfcs/034-timeline-archive.md	Normal file
@@ -0,0 +1,507 @@

# Timeline Archival

## Summary

This RFC describes a mechanism for pageservers to eliminate local storage + compute work
for timelines which are not in use, in response to external API calls to "archive" a timeline.

The archived state roughly corresponds to fully offloading a timeline to object storage, such
that its cost is purely the cost of that object storage.

## Motivation

Archived timelines serve multiple purposes:
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
database from longer ago than their PITR window.
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
to diligently clean them up later to avoid overloading the pageserver (currently we support
up to ~500 branches per tenant).

### Prior art

Most storage and database systems have some form of snapshot, which can be implemented several ways:
1. full copies of data (e.g. an EBS snapshot to S3)
2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
3. a series of snapshots which are CoW or de-duplicated relative to one another.

Today's Neon branches are approximately like `2.`, although due to implementation details branches
often end up storing much more data than they really need, as parent branches assume that all data
at the branch point is needed. The layers pinned in the parent branch may have a much larger size
than the physical size of a compressed image layer representing the data at the branch point.

## Requirements

- Enter & exit the archived state in response to external admin API calls
- API calls to modify the archived state are atomic and durable
- An archived timeline should eventually (once out of the PITR window) use an efficient compressed
representation, and avoid retaining arbitrarily large data in its parent branch.
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
but must not scale with the number of _archived_ branches.
- Background I/O for archived branches should only be done a limited number of times to evolve them
to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping"
overhead for archived branches, including operations related to calculating sizes for billing.
- The pageserver should put no load on the safekeeper for archived branches.
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
to a performant state in a short time (linear with the branch's logical size)

## Non Goals

- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
in Neon's internal format.
- Compute cold starts after activating an archived branch will not have comparable performance to
cold starts on an active branch.
- Archived branches will not use any new/additional compression or de-duplication beyond what
is already implemented for image layers (zstd per page).
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
are only activated explicitly via the HTTP API.
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
detailed HTTP APIs other than the specific API for listing archived timelines.
- A parent branch may not be archived unless all its children are.

## Impacted Components

pageserver, storage controller

## Terminology

**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
may assume that this branch is now very cheap to store, although this may not be physically so until the
branch proceeds to the offloaded state.

**Active** branches are branches which are available for use by page_service clients, and have a relatively
high cost due to consuming local storage.

**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
that they now consume minimal runtime resources and have a cost similar to the cost of object storage.

**Activate** (verb): transition from Archived to Active

**Archive** (verb): transition from Active to Archived

**Offload** (verb): transition from Archived to Offloaded

**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.

**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is
warmed up, good performance will be available to page_service clients.

## Implementation

### High level flow

We may think of a timeline which is archived and then activated as proceeding through a series of states:

```mermaid
stateDiagram
    [*] --> Active(warm)
    Active(warm) --> Archived
    Archived --> Offloaded
    Archived --> Active(warm)
    Offloaded --> Active(cold)
    Active(cold) --> Active(warm)
```

Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
of branches will be:
- Very frequent: Short lived branches: Active -> Deleted
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
- Rare: Branches used to restore old state: Active -> Archived -> Offloaded -> Active

These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
of:
- the timeline's lifecycle state: active or archived, stored in the timeline's index
- its offload state: whether the pageserver has chosen to drop local storage of the timeline and write it into the
manifest of offloaded timelines.
- cache state (whether it's warm or cold).

### Storage format changes

There are two storage format changes:
1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
be considered active or archived.
2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)

The manifest object will have a format like this:
```
{
  "offload_timelines": [
    {
      "timeline_id": ...
      "last_record_lsn": ...
      "last_record_lsn_time": ...
      "pitr_interval": ...
      "last_gc_lsn": ...   # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
      "logical_size": ...  # The size at last_record_lsn
      "physical_size": ...
      "parent": Option<{
        "timeline_id"...
        "lsn"...  # Branch point LSN on the parent
        "requires_data": bool  # True if this branch depends on layers in its parent, identify it here
      }>
    }
  ]
}
```
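
As a reference, the manifest schema above expressed as Rust types (field names and
integer representations are illustrative, not the pageserver's actual definitions):

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct TenantManifest {
    offload_timelines: Vec<OffloadedTimeline>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedTimeline {
    timeline_id: String,
    last_record_lsn: u64,
    /// Wall-clock time of last_record_lsn; assumed unix epoch seconds here.
    last_record_lsn_time: u64,
    /// PITR interval; assumed seconds here.
    pitr_interval: u64,
    /// Equal to last_record_lsn if this branch has no history (i.e. a snapshot).
    last_gc_lsn: u64,
    /// The size at last_record_lsn.
    logical_size: u64,
    physical_size: u64,
    parent: Option<OffloadedParent>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedParent {
    timeline_id: String,
    /// Branch point LSN on the parent.
    lsn: u64,
    /// True if this branch depends on layers in its parent.
    requires_data: bool,
}
```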
The information about a timeline in its offload state is intentionally minimal: just enough to decide:
- Whether it requires [archive optimization](#archive-branch-optimization-flattening) by rewriting as a set of image layers: we may infer this
by checking if now > last_record_lsn_time + pitr_interval, and last_gc_lsn < last_record_lsn (see the sketch below).
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
layers that the archived branch depends on
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
is received for a timeline_id that isn't in the set of live `Timelines` or in the manifest, then
we don't need to go to S3 for the deletion).
- How much archived space to report in consumption metrics
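
The first bullet's check, as a sketch over the manifest types above (assuming
epoch-second timestamps):

```rust
/// True if the offloaded timeline has fallen out of its PITR window but still
/// carries history, i.e. it should be rewritten to image layers (flattened).
fn needs_archive_optimization(t: &OffloadedTimeline, now_secs: u64) -> bool {
    now_secs > t.last_record_lsn_time + t.pitr_interval && t.last_gc_lsn < t.last_record_lsn
}
```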
The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
|
||||
set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
|
||||
(offloaded timelines).
|
||||
|
||||
For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
|
||||
index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but
|
||||
give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code
|
||||
for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
|
||||
the manifest file.
|
||||
|
||||
### API & Timeline state
|
||||
|
||||
Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will
|
||||
be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which
|
||||
may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
|
||||
a per-timeline configuration).
|
||||
|
||||
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
|
||||
```
|
||||
{
|
||||
'state': 'active|archive'
|
||||
}
|
||||
```
|
||||
|
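
An illustrative client call for this endpoint (the pageserver host/port here match
the docker-compose example earlier in this document; error handling elided):

```rust
/// Sketch: request that a timeline be archived via the configure endpoint.
async fn archive_timeline(tenant_id: &str, timeline_id: &str) -> anyhow::Result<()> {
    let resp = reqwest::Client::new()
        .put(format!(
            "http://pageserver:9898/v1/tenants/{tenant_id}/timelines/{timeline_id}/configure"
        ))
        .json(&serde_json::json!({ "state": "archive" }))
        .send()
        .await?;
    anyhow::ensure!(resp.status().is_success(), "configure failed: {}", resp.status());
    Ok(())
}
```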
When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.

When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's
index, but not any data: it should be about as fast as a couple of small S3 requests.

The API will be available with an identical path via the storage controller: calling this on a sharded tenant
will simply map the API call to all the shards.

Archived timelines may never have descendant timelines which are active. This will be enforced at the API level,
such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
that all its descendants are archived. It is the caller's responsibility to walk the hierarchy of timelines
in the proper order if they would like to archive whole trees of branches.

Because archived timelines will be excluded from the usual timeline listing APIs, a new API specifically
for archived timelines will be added: this is for use in support/debug:

```
GET /v1/tenants/{tenant_id}/archived_timelines

{
...same per-timeline content as the tenant manifest...
}

```

### Tenant attach changes

Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
we load its index_part.json. To avoid the number of GETs scaling linearly with the number of archived
timelines, we must have a single object that tells us which timelines do not need to be loaded. The
number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
because each request covers 1000 timelines.

This is **not** literally the same as the set of timelines that have state=archived. Rather, it is
the set of timelines which have been offloaded in the background after their state was set to archived.

We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
exist from the perspective of an active `Tenant`, except in deletion: timeline deletion will need
to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
to delete an offloaded timeline.

### Warm-up API

`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`

This API will be similar to the existing `download_remote_layers` API, but smarter:
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
of downloads, so that the caller can poll.

The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache
eviction and heatmaps, as well as in this specific case of warming up a timeline.

The caller does not have to wait for the warm-up API, or call it at all. But it is strongly advised
to call it, because otherwise populating local contents for a timeline can take a long time when waiting
for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
volatile.

### Background work

Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters
an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
([archive branch optimization](#archive-branch-optimization-flattening)), or offloading the timeline from local disk
if its state permits that.

Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
has elapsed and it can now be rewritten to image layers.

#### Archive branch offload

Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
any actual work.

This work is done in the background compaction loop. It makes sense to tag this work onto the compaction
loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.

The condition for offload is simple:
- a `Timeline` object exists with state `Archived`
- the timeline does not have any non-offloaded children.

Regarding the condition that children must be offloaded, this will always be eventually true, because
we enforce at the API level that children of archived timelines must themselves be archived, and all
archived timelines will eventually be offloaded.

Offloading a timeline is simple (sketched below):
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
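
A toy sketch of those steps, with stand-in `Tenant`/`Timeline` types (the real
pageserver types differ; this only shows the ordering):

```rust
struct OffloadAttrs {
    logical_size: u64,
}

struct Timeline {
    local_path: std::path::PathBuf,
}

impl Timeline {
    fn offload_attrs(&self) -> OffloadAttrs {
        OffloadAttrs { logical_size: 0 } // placeholder
    }
    async fn shutdown(&self) { /* stop background tasks for this timeline */ }
}

struct Tenant;

impl Tenant {
    fn remove_timeline(&self, _t: &Timeline) {}
    fn record_offloaded(&self, _a: OffloadAttrs) {}
    async fn upload_manifest(&self) -> anyhow::Result<()> { Ok(()) }
}

async fn offload(tenant: &Tenant, timeline: Timeline) -> anyhow::Result<()> {
    // 1. Capture the attributes kept in the offloaded state (logical size etc.)
    let attrs = timeline.offload_attrs();
    // 2. Shut the timeline down and remove it from the tenant, as if deleting it.
    timeline.shutdown().await;
    tenant.remove_timeline(&timeline);
    // 3. Erase the timeline's local content.
    tokio::fs::remove_dir_all(&timeline.local_path).await?;
    // 4. Record the offloaded entry and persist the manifest so the timeline
    //    is not loaded on the next tenant start.
    tenant.record_offloaded(attrs);
    tenant.upload_manifest().await
}
```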
||||
#### Archive branch optimization (flattening)
|
||||
|
||||
When we offloaded a branch, it might have had some history that prevented rewriting it to a single
|
||||
point in time set of image layers. For example, a branch might have several days of writes and a 7
|
||||
day PITR: when we archive it, it still has those days of history.
|
||||
|
||||
Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
|
||||
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
|
||||
a point in time compared with delta layers
|
||||
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
|
||||
for data, i.e. the ancestor is free to GC layers files at+below the branch point
|
||||
|
||||
Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
|
||||
branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
|
||||
a true snapshot at that LSN.

It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper).

Archive branch optimization should be done _before_ background offloads during compaction, because there may
be timelines which are ready to be offloaded but would also benefit from the optimization step before
being offloaded. For example, a branch which has already fallen out of the PITR window and has no history
of its own may be immediately rewritten as a series of image layers before being offloaded.

### Consumption metrics

Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipation
that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
vs. ordinary content.

Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
variant of `MetricsKey`: receivers are then free to bill on this metric as they please.

### Secondary locations

Archived timelines (including offloaded timelines) will be excluded from heatmaps. Thereby,
when a timeline is archived, its contents will be dropped from secondary locations after the
next cycle of heatmap upload & secondary download.

### Sharding

Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
the same way that timeline creation and deletion are done. There are no special rules about ordering:
the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.

Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
will be authoritative for consumption metrics.

## Error cases

### Errors in sharded tenants

If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
state, where a timeline is archived on some shards but not on others.

We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent
state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't
break anything, it's just "weird".

This is similar to the status quo for timeline creation and deletion: callers are expected to retry
these operations until they succeed.

### Archiving/activating

Archiving/activating a timeline can fail in a limited number of ways (a sketch of the
resulting error type follows the list):
1. I/O error storing/reading the timeline's updated index
   - These errors are always retryable: a fundamental design assumption of the pageserver is that remote
     storage errors are always transient.
2. NotFound if the timeline doesn't exist
   - Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
   - The storage controller has runtime locking to prevent races such as deleting a timeline while
     archiving it.
3. BadRequest if the rules around ancestors/descendants of archived timelines would be violated
   - Callers are expected to do their own checks to avoid hitting this case. If they make
     a mistake and encounter this error, they should give up.
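
A sketch of how these cases might surface as a Rust error type (illustrative only, not the
actual handler signature):

```rust
// Illustrative error type for the archive/activate HTTP handlers.
#[derive(Debug, thiserror::Error)]
enum ArchivalError {
    // Case 1: remote storage I/O failed; treated as transient, callers retry.
    #[error("remote storage: {0}")]
    Remote(#[from] anyhow::Error),
    // Case 2: mapped to 404; e.g. the caller raced with a deletion.
    #[error("timeline not found")]
    NotFound,
    // Case 3: mapped to 400; e.g. archiving a timeline with active children.
    #[error("bad request: {0}")]
    BadRequest(String),
}
```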

### Offloading

Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
tenant manifest. In such error cases, we give up in the expectation that offloading will be tried
again at the next iteration of the compaction loop.

### Archive branch optimization

Optimization is a special form of compaction, so it can encounter all the same errors as regular
compaction: it should return `Result<(), CompactionError>`, and as with compaction it will be retried on
the next iteration of the compaction loop.

## Optimizations

### Delaying storage optimization if retaining parent layers is cheaper

Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
are offloaded to S3 they're totally safe, inert things.

However, in some cases it can be advantageous to retain extra history on the parent branch rather
than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB
of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
for each nightly branch is inefficient compared with just keeping more history on the main branch.

Getting this right requires consideration of:
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
  write out extra image layers, then it might make more sense to just write out the image layers on
  the archived branch.
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
  the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely
  large layer map can cause problems elsewhere.

This optimization can probably be implemented quite cheaply with some basic heuristics (see the
sketch after this list), for example:
- don't bother doing optimization on an archive branch if the LSN distance between
  its branch point and the end of the PITR window is <5% of the logical size of the archive branch
- ...but don't keep more history on the main branch than double the PITR
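
For concreteness, a minimal sketch of that heuristic, with placeholder accessors (the 5%
figure is the one suggested above, expressed as `/ 20`):

```rust
// Placeholder heuristic: only flatten when the history the parent would have
// to retain for this branch is a meaningful fraction of its logical size.
fn should_flatten(branch_point: Lsn, pitr_cutoff: Lsn, branch_logical_size: u64) -> bool {
    // Bytes of WAL the parent must keep if we do NOT flatten this branch.
    let retained = pitr_cutoff.0.saturating_sub(branch_point.0);
    // Flatten once retention stops being cheap (>=5% of the branch's size).
    // A separate cap (not shown) bounds parent history to double the PITR.
    retained >= branch_logical_size / 20
}
```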

### Creating a timeline in archived state (a snapshot)

Sometimes, one might want to create a branch with no history, which will not be written to
before it is archived. This is a snapshot, although we do not require a special snapshot API,
since a snapshot can be represented as a timeline with no history.

This can be accomplished by simply creating a timeline and then immediately archiving it, but
that is somewhat wasteful: the new timeline will spin up various tasks and open a connection to the storage
broker to try to ingest WAL, before being shut down by the subsequent archival call. To explicitly
support this common special case, we may add a parameter to the timeline creation API which
creates a timeline directly in the archived state.
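
For example, the creation request might gain a flag along these lines; the `archived` field
is hypothetical, and the struct is only a loose rendering of today's creation API:

```rust
// Hypothetical extension of the timeline creation request.
#[derive(serde::Serialize, serde::Deserialize)]
pub struct TimelineCreateRequest {
    pub new_timeline_id: TimelineId,
    pub ancestor_timeline_id: Option<TimelineId>,
    pub ancestor_start_lsn: Option<Lsn>,
    /// If true, create the timeline directly in the archived state: no WAL
    /// ingest tasks start, and it is offloaded on the next compaction pass.
    #[serde(default)]
    pub archived: bool,
}
```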

Such a timeline creation will do exactly two I/Os at creation time:
- write the index_part object to record the timeline's existence
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
  write the tenant manifest.

Later, when the timeline falls off the end of the PITR interval, the usual storage optimization
logic will wake up the 'snapshot' branch and write out image layers.

## Future Work

### Enabling `fullbackup` dumps from archive branches

It would be useful to be able to export an archive branch to another system, or for use in a local
postgres database.

This could be implemented as a general capability for all branches, in which case it would "just work"
for archive branches by activating them. However, downloading all the layers in a branch just to generate
a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
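
A sketch of that special case, with all names hypothetical:

```rust
// Hypothetical sketch: stream a flattened branch's image layers straight from
// remote storage into a fullbackup stream, without staging them on local disk.
async fn stream_fullbackup(
    remote: &RemoteStorage,
    layers: &[RemotePath],
    out: &mut FullBackupWriter,
) -> anyhow::Result<()> {
    for layer in layers {
        // A flattened branch's image layers are sorted runs of page images at
        // one LSN, so they can be transcoded page by page as they download.
        let mut pages = remote.download_stream(layer).await?;
        while let Some((key, image)) = pages.next_page().await? {
            out.write_page(key, &image).await?;
        }
    }
    out.finalize().await
}
```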

Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup
stream to S3 in an intermediate format, and then having one node stitch them together).

### Tagging layers from archived branches

When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
cheaper storage.

This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
external hints on which branches are likely to be reactivated, and which branches are good candidates for
tagging for low-performance storage.

Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object
stores have similar mechanisms.

### Storing sequences of archive branches as deltas

When archived branches are used as scheduled snapshots, we could store them even more efficiently
by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
pages). This is the kind of encoding that many backup storage systems use.

The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full
copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
so the complexity tradeoff of diff-encoding it is dubious).

One does not necessarily have to read back the previous snapshot in order to encode the next one: if the
pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch,
so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
delta snapshot".
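
Sketched with hypothetical helpers, that delta-encoding step could look like:

```rust
// Hypothetical sketch: encode Tuesday's snapshot as a delta against Monday's,
// using the extra history intentionally retained on the main branch.
async fn encode_delta_snapshot(
    main: &Timeline,
    prev_lsn: Lsn, // Monday's branch point
    next_lsn: Lsn, // Tuesday's branch point
) -> anyhow::Result<DeltaSnapshot> {
    let mut delta = DeltaSnapshot::new(prev_lsn, next_lsn);
    for key in main.keys_at(next_lsn).await? {
        let new_image = main.get_page(key, next_lsn).await?;
        let old_image = main.get_page(key, prev_lsn).await?;
        // Store only the pages that changed between the two branch points.
        if new_image != old_image {
            delta.push(key, new_image);
        }
    }
    delta.seal().await
}
```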

Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
each other: perhaps this would be done by making the archive branches have child/parent relationships with
each other, or perhaps we would permit them to remain children of their original parent, but additionally
have a relationship with the snapshot they're encoded relative to.

Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
out how frequently to write a full copy is important. This is essentially a zoomed-out version of what
we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.

## FAQ/Alternatives

### Store all timelines in the tenant manifest

Rather than special-casing offloaded timelines in the tenant manifest, we could store a total
manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
startup.

That would be a more invasive change (it would require hooking into timeline creation), and would
generate much more I/O to this manifest for tenants that had many branches _and_ frequent
create/delete cycles for short-lived branches. Restricting the manifest to offloaded timelines
means that we only have to cope with the rate at which long-lived timelines are archived, rather
than the rate at which short-lived timelines are created & destroyed.
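
For reference, a sketch of the manifest's shape; the field names are illustrative, not a
committed format:

```rust
// Illustrative shape of a tenant manifest restricted to offloaded timelines.
#[derive(serde::Serialize, serde::Deserialize)]
struct TenantManifest {
    /// Bumped on format changes so older pageservers refuse newer manifests.
    version: u32,
    /// One entry per offloaded timeline. Resident timelines are still
    /// discovered by listing S3, so ordinary creations and deletions of
    /// short-lived branches never touch this file.
    offloaded_timelines: Vec<OffloadedTimelineManifest>,
}

#[derive(serde::Serialize, serde::Deserialize)]
struct OffloadedTimelineManifest {
    timeline_id: TimelineId,
    ancestor_timeline_id: Option<TimelineId>,
    /// Cached so consumption metrics can be reported without loading the timeline.
    logical_size: u64,
    /// Set once flattening has run, i.e. the ancestor no longer needs to
    /// retain layers at+below this branch's branch point.
    independent_of_ancestor: bool,
}
```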

### Automatically archiving/activating timelines without external API calls

We could implement TTL-driven offload of timelines, waking them up when a page request
arrives.

This has downsides:
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
  know which of their branches are in this state, and might get a surprise when they try
  to use such a branch.
- Price fluctuation: if the archival of a branch is used in end-user pricing, then users
  prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it
  is created, rather than having a usage-dependent storage price.
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
  would be awkward, compared with an external entry point.

### Make offloaded a state of Timeline

To reduce the operator-facing complexity of having some timeline APIs that only return
non-offloaded timelines, we could build the offloaded state into the Timeline type.

`timeline.rs` is already one of the most egregiously long source files in the tree, so
this is rejected on the basis that we need to avoid making that complexity worse.

@@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration:
 - Use `diesel migration generate <name>` to create a new migration
 - Populate the SQL files in the `migrations/` subdirectory
 - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
-  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
+  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
 - Commit the migration files and the changes to schema.rs
 - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
 - The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
|
||||
|
||||
@@ -96,12 +96,6 @@ pub struct ComputeSpec {
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
// When we are starting a new replica in hot standby mode,
|
||||
// we need to know if the primary is running.
|
||||
// This is used to determine if replica should wait for
|
||||
// RUNNING_XACTS from primary or not.
|
||||
pub primary_is_running: Option<bool>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -13,11 +13,7 @@ use std::{
|
||||
|
||||
use measured::{
|
||||
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||
metric::{
|
||||
group::{Encoding, MetricValue},
|
||||
name::MetricNameEncoder,
|
||||
Metric, MetricType, MetricVec,
|
||||
},
|
||||
metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
|
||||
text::TextEncoder,
|
||||
LabelGroup,
|
||||
};
|
||||
@@ -144,6 +140,7 @@ impl<const N: usize> HyperLogLogState<N> {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||
for HyperLogLogState<N>
|
||||
{
|
||||
@@ -182,12 +179,13 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.try_for_each(|(hll_shard, val)| {
|
||||
enc.write_metric_value(
|
||||
name.by_ref(),
|
||||
CounterState::new(val as u64).collect_into(
|
||||
&(),
|
||||
labels.by_ref().compose_with(HllShardLabel {
|
||||
hll_shard: hll_shard as i64,
|
||||
}),
|
||||
MetricValue::Int(val as i64),
|
||||
name.by_ref(),
|
||||
enc,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ use measured::{
|
||||
metric::{
|
||||
counter::CounterState,
|
||||
gauge::GaugeState,
|
||||
group::{Encoding, MetricValue},
|
||||
group::Encoding,
|
||||
name::{MetricName, MetricNameEncoder},
|
||||
MetricEncoding, MetricFamilyEncoding,
|
||||
},
|
||||
@@ -103,9 +103,10 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("Failed to register maxrss_kb int gauge")
|
||||
});
|
||||
|
||||
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher,
|
||||
/// especially during many concurrent disk operations.
|
||||
pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
|
||||
&[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];
|
||||
|
||||
pub struct BuildInfo {
|
||||
pub revision: &'static str,
|
||||
@@ -170,8 +171,11 @@ fn write_gauge<Enc: Encoding>(
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut Enc,
|
||||
) -> Result<(), Enc::Err> {
|
||||
enc.write_metric_value(name, labels, MetricValue::Int(x))
|
||||
) -> Result<(), Enc::Err>
|
||||
where
|
||||
GaugeState: MetricEncoding<Enc>,
|
||||
{
|
||||
GaugeState::new(x).collect_into(&(), labels, name, enc)
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -543,15 +547,6 @@ impl<T: Encoding> Encoding for Inc<T> {
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
||||
@@ -578,15 +573,6 @@ impl<T: Encoding> Encoding for Dec<T> {
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the dec counter to the encoder
|
||||
|
||||
@@ -11,6 +11,27 @@ use crate::{
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
pub new_tenant_id: TenantShardId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If omitted, create a single shard with TenantShardId::unsharded()
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub placement_policy: Option<PlacementPolicy>,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -280,4 +301,19 @@ mod test {
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_reject_unknown_field() {
|
||||
let id = TenantId::generate();
|
||||
let create_request = serde_json::json!({
|
||||
"new_tenant_id": id.to_string(),
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
});
|
||||
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18;
|
||||
/// See [`Key::to_i128`] for more information on the encoding.
|
||||
pub const METADATA_KEY_SIZE: usize = 16;
|
||||
|
||||
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
|
||||
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key.
|
||||
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
|
||||
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
|
||||
|
||||
@@ -160,8 +160,9 @@ impl Key {
|
||||
key
|
||||
}
|
||||
|
||||
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys).
|
||||
/// Convert a 18B slice to a key. This function should not be used for 16B metadata keys because `field2` is handled differently.
|
||||
/// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). There are some restrictions on `field2`,
|
||||
/// and therefore not all 18B slices are valid page server keys.
|
||||
pub fn from_slice(b: &[u8]) -> Self {
|
||||
Key {
|
||||
field1: b[0],
|
||||
@@ -173,7 +174,7 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Convert a key to a 18B slice. This function should not be used for getting a 16B metadata key because `field2` is handled differently.
|
||||
/// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys).
|
||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||
buf[0] = self.field1;
|
||||
|
||||
@@ -17,6 +17,16 @@ pub struct KeySpace {
|
||||
pub ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for KeySpace {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[")?;
|
||||
for range in &self.ranges {
|
||||
write!(f, "{}..{},", range.start, range.end)?;
|
||||
}
|
||||
write!(f, "]")
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper type for sparse keyspaces.
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
pub struct SparseKeySpace(pub KeySpace);
|
||||
|
||||
@@ -9,6 +9,7 @@ use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
@@ -25,7 +26,6 @@ use utils::{
|
||||
serde_system_time,
|
||||
};
|
||||
|
||||
use crate::controller_api::PlacementPolicy;
|
||||
use crate::{
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
@@ -229,6 +229,11 @@ pub struct TimelineCreateRequest {
|
||||
pub pg_version: Option<u32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LsnLeaseRequest {
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantShardSplitRequest {
|
||||
pub new_shard_count: u8,
|
||||
@@ -271,28 +276,6 @@ impl Default for ShardParameters {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
pub new_tenant_id: TenantShardId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
|
||||
// If omitted, create a single shard with TenantShardId::unsharded()
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
// This parameter is only meaningful in requests sent to the storage controller
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub placement_policy: Option<PlacementPolicy>,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::TenantConf` with
|
||||
/// simpler types.
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
|
||||
@@ -311,7 +294,6 @@ pub struct TenantConfig {
|
||||
pub walreceiver_connect_timeout: Option<String>,
|
||||
pub lagging_wal_timeout: Option<String>,
|
||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||
pub trace_read_requests: Option<bool>,
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||
@@ -455,6 +437,41 @@ pub enum CompactionAlgorithm {
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum ImageCompressionAlgorithm {
|
||||
// Disabled for writes, support decompressing during read path
|
||||
Disabled,
|
||||
/// Zstandard compression. Level 0 and None mean the same (default level). Levels can be negative as well.
|
||||
/// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
|
||||
Zstd {
|
||||
level: Option<i8>,
|
||||
},
|
||||
}
|
||||
|
||||
impl FromStr for ImageCompressionAlgorithm {
|
||||
type Err = anyhow::Error;
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut components = s.split(['(', ')']);
|
||||
let first = components
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("empty string"))?;
|
||||
match first {
|
||||
"disabled" => Ok(ImageCompressionAlgorithm::Disabled),
|
||||
"zstd" => {
|
||||
let level = if let Some(v) = components.next() {
|
||||
let v: i8 = v.parse()?;
|
||||
Some(v)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(ImageCompressionAlgorithm::Zstd { level })
|
||||
}
|
||||
_ => anyhow::bail!("invalid specifier '{first}'"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompactionAlgorithmSettings {
|
||||
pub kind: CompactionAlgorithm,
|
||||
@@ -547,10 +564,6 @@ pub struct LocationConfigListResponse {
|
||||
pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TenantCreateResponse(pub TenantId);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
@@ -607,31 +620,6 @@ impl TenantConfigRequest {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct TenantAttachRequest {
|
||||
#[serde(default)]
|
||||
pub config: TenantAttachConfig,
|
||||
#[serde(default)]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
/// Newtype to enforce deny_unknown_fields on TenantConfig for
|
||||
/// its usage inside `TenantAttachRequest`.
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantAttachConfig {
|
||||
#[serde(flatten)]
|
||||
allowing_unknown_fields: TenantConfig,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantAttachConfig {
|
||||
type Target = TenantConfig;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.allowing_unknown_fields
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
|
||||
@@ -650,8 +638,7 @@ pub struct TenantInfo {
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -696,6 +683,16 @@ pub struct TimelineInfo {
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_logical_size_non_incremental: Option<u64>,
|
||||
|
||||
/// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
|
||||
/// beyond the branch's branch point, we only count up to the branch point.
|
||||
pub pitr_history_size: u64,
|
||||
|
||||
/// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
|
||||
/// ancestor data used by this branch would have been retained anyway). If this is false, then
|
||||
/// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
|
||||
/// otherwise be able to GC.
|
||||
pub within_ancestor_pitr: bool,
|
||||
|
||||
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
||||
|
||||
pub wal_source_connstr: Option<String>,
|
||||
@@ -1478,7 +1475,7 @@ mod tests {
|
||||
state: TenantState::Active,
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
generation: 1,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -1488,7 +1485,8 @@ mod tests {
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
},
|
||||
"generation" : 1
|
||||
});
|
||||
|
||||
let original_broken = TenantInfo {
|
||||
@@ -1499,7 +1497,7 @@ mod tests {
|
||||
},
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
generation: 1,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
@@ -1513,7 +1511,8 @@ mod tests {
|
||||
"current_physical_size": 42,
|
||||
"attachment_status": {
|
||||
"slug":"attached",
|
||||
}
|
||||
},
|
||||
"generation" : 1
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
@@ -1531,18 +1530,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_reject_unknown_field() {
|
||||
let id = TenantId::generate();
|
||||
let create_request = json!({
|
||||
"new_tenant_id": id.to_string(),
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
});
|
||||
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
|
||||
let id = TenantId::generate();
|
||||
let config_request = json!({
|
||||
"tenant_id": id.to_string(),
|
||||
@@ -1554,18 +1541,6 @@ mod tests {
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
|
||||
let attach_request = json!({
|
||||
"config": {
|
||||
"unknown_field": "unknown_value".to_string(),
|
||||
},
|
||||
});
|
||||
let err = serde_json::from_value::<TenantAttachRequest>(attach_request).unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("unknown field `unknown_field`"),
|
||||
"expect unknown field `unknown_field` error, got: {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1689,4 +1664,25 @@ mod tests {
|
||||
AuxFilePolicy::CrossValidation
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_image_compression_algorithm_parsing() {
|
||||
use ImageCompressionAlgorithm::*;
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
|
||||
Disabled
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
|
||||
Zstd { level: None }
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
|
||||
Zstd { level: Some(18) }
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
|
||||
Zstd { level: Some(-3) }
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use utils::id::TimelineId;
|
||||
|
||||
#[derive(Default, serde::Serialize)]
|
||||
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
|
||||
pub struct AncestorDetached {
|
||||
pub reparented_timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -1,59 +1,42 @@
|
||||
use std::{ops::RangeInclusive, str::FromStr};
|
||||
//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||
//!
|
||||
//! This module contains a variety of types used to represent the concept of sharding
|
||||
//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||
//! we provide a summary here.
|
||||
//!
|
||||
//! Types used to describe shards:
|
||||
//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||
//! which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||
//! a shard suffix.
|
||||
//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||
//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||
//! without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||
//! tenant, such as layer files.
|
||||
//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||
//! detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||
//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||
//! four hex digits. An unsharded tenant is `0000`.
|
||||
//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||
//!
|
||||
//! Types used to describe the parameters for data distribution in a sharded tenant:
|
||||
//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||
//! multiple shards. Its value is given in 8kiB pages.
|
||||
//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||
//! always zero: this is provided for future upgrades that might introduce different
|
||||
//! data distribution schemes.
|
||||
//!
|
||||
//! Examples:
|
||||
//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||
//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
//! and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
use crate::{key::Key, models::ShardParameters};
|
||||
use hex::FromHex;
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||
///
|
||||
/// This module contains a variety of types used to represent the concept of sharding
|
||||
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||
/// we provide a summary here.
|
||||
///
|
||||
/// Types used to describe shards:
|
||||
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||
/// a shard suffix.
|
||||
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||
/// tenant, such as layer files.
|
||||
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||
/// four hex digits. An unsharded tenant is `0000`.
|
||||
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||
///
|
||||
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
||||
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||
/// multiple shards. Its value is given in 8kiB pages.
|
||||
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||
/// always zero: this is provided for future upgrades that might introduce different
|
||||
/// data distribution schemes.
|
||||
///
|
||||
/// Examples:
|
||||
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
/// and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
#[doc(inline)]
|
||||
pub use ::utils::shard::*;
|
||||
|
||||
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
||||
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
||||
@@ -65,362 +48,6 @@ pub struct ShardIdentity {
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||
///
|
||||
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
||||
///
|
||||
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as [`TenantShardId::unsharded`].
|
||||
///
|
||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||
pub fn count(&self) -> u8 {
|
||||
if self.0 > 0 {
|
||||
self.0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
|
||||
/// The literal internal value: this is **not** the number of shards in the
|
||||
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
|
||||
/// [`Self::count`] if you want to know the cardinality of shards.
|
||||
pub fn literal(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
|
||||
/// uses the legacy format for `TenantShardId`. See also the documentation for
|
||||
/// [`Self::count`].
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
|
||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub const fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardNumber {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
|
||||
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber::MAX,
|
||||
shard_count: ShardCount::MAX,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
|
||||
ShardSlug(self)
|
||||
}
|
||||
|
||||
/// Convenience for code that has special behavior on the 0th shard.
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
|
||||
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
|
||||
/// is useful when logging from code that is already in a span that includes tenant ID, to
|
||||
/// keep messages reasonably terse.
|
||||
pub fn to_index(&self) -> ShardIndex {
|
||||
ShardIndex {
|
||||
shard_number: self.shard_number,
|
||||
shard_count: self.shard_count,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate the children of this TenantShardId when splitting the overall tenant into
|
||||
/// the given number of shards.
|
||||
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
|
||||
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
|
||||
let mut child_shards = Vec::new();
|
||||
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
|
||||
// Key mapping is based on a round robin mapping of key hash modulo shard count,
|
||||
// so our child shards are the ones which the same keys would map to.
|
||||
if shard_number % effective_old_shard_count == self.shard_number.0 {
|
||||
child_shards.push(TenantShardId {
|
||||
tenant_id: self.tenant_id,
|
||||
shard_number: ShardNumber(shard_number),
|
||||
shard_count: new_shard_count,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
child_shards
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{:02x}{:02x}",
|
||||
self.0.shard_number.0, self.0.shard_count.0
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TenantShardId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.shard_count != ShardCount(0) {
|
||||
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
|
||||
} else {
|
||||
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
||||
// is distinct from the normal single shard case (shard count == 1).
|
||||
self.tenant_id.fmt(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TenantShardId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// Debug is the same as Display: the compact hex representation
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for TenantShardId {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
|
||||
if s.len() == 32 {
|
||||
// Legacy case: no shard specified
|
||||
Ok(Self {
|
||||
tenant_id: TenantId::from_str(s)?,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
})
|
||||
} else if s.len() == 37 {
|
||||
let bytes = s.as_bytes();
|
||||
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
|
||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
||||
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(shard_parts[0]),
|
||||
shard_count: ShardCount(shard_parts[1]),
|
||||
})
|
||||
} else {
|
||||
Err(hex::FromHexError::InvalidStringLength)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 18]> for TenantShardId {
|
||||
fn from(b: [u8; 18]) -> Self {
|
||||
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
|
||||
|
||||
Self {
|
||||
tenant_id: TenantId::from(tenant_id_bytes),
|
||||
shard_number: ShardNumber(b[16]),
|
||||
shard_count: ShardCount(b[17]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
shard_number: number,
|
||||
shard_count: count,
|
||||
}
|
||||
}
|
||||
pub fn unsharded() -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
|
||||
/// For use in constructing remote storage paths: concatenate this with a TenantId
|
||||
/// to get a fully qualified TenantShardId.
|
||||
///
|
||||
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
|
||||
/// that the legacy pre-sharding remote key format is preserved.
|
||||
pub fn get_suffix(&self) -> String {
|
||||
if self.is_unsharded() {
|
||||
"".to_string()
|
||||
} else {
|
||||
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ShardIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// Debug is the same as Display: the compact hex representation
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for ShardIndex {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
// Expect format: 1 byte shard number, 1 byte shard count
|
||||
if s.len() == 4 {
|
||||
let bytes = s.as_bytes();
|
||||
let mut shard_parts: [u8; 2] = [0u8; 2];
|
||||
hex::decode_to_slice(bytes, &mut shard_parts)?;
|
||||
Ok(Self {
|
||||
shard_number: ShardNumber(shard_parts[0]),
|
||||
shard_count: ShardCount(shard_parts[1]),
|
||||
})
|
||||
} else {
|
||||
Err(hex::FromHexError::InvalidStringLength)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<[u8; 2]> for ShardIndex {
|
||||
fn from(b: [u8; 2]) -> Self {
|
||||
Self {
|
||||
shard_number: ShardNumber(b[0]),
|
||||
shard_count: ShardCount(b[1]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TenantShardId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Note: while human encoding of [`TenantShardId`] is backward and forward
|
||||
// compatible, this binary encoding is not.
|
||||
let mut packed: [u8; 18] = [0; 18];
|
||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
||||
packed[16] = self.shard_number.0;
|
||||
packed[17] = self.shard_count.0;
|
||||
|
||||
packed.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TenantShardId {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
||||
type Value = TenantShardId;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 18])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 18] = Deserialize::deserialize(s)?;
|
||||
Ok(TenantShardId::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
TenantShardId::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
18,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Stripe size in number of pages
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardStripeSize(pub u32);
|
||||
@@ -585,77 +212,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for ShardIndex {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Binary encoding is not used in index_part.json, but is included in anticipation of
|
||||
// switching various structures (e.g. inter-process communication, remote metadata) to more
|
||||
// compact binary encodings in future.
|
||||
let mut packed: [u8; 2] = [0; 2];
|
||||
packed[0] = self.shard_number.0;
|
||||
packed[1] = self.shard_count.0;
|
||||
packed.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for ShardIndex {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for IdVisitor {
|
||||
type Value = ShardIndex;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 2])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 2] = Deserialize::deserialize(s)?;
|
||||
Ok(ShardIndex::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
ShardIndex::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
2,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
}

/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
/// in order to be able to serve basebackup requests without peer communication).
fn key_is_shard0(key: &Key) -> bool {

@@ -737,7 +293,9 @@ pub fn describe(

#[cfg(test)]
mod tests {
    use utils::Hex;
    use std::str::FromStr;

    use utils::{id::TenantId, Hex};

    use super::*;

@@ -13,6 +13,7 @@ rustls.workspace = true
serde.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tokio-rustls.workspace = true
tracing.workspace = true

@@ -23,4 +24,4 @@ workspace_hack.workspace = true
once_cell.workspace = true
rustls-pemfile.workspace = true
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
tokio-postgres-rustls.workspace = true

@@ -16,6 +16,7 @@ use std::{fmt, io};
use std::{future::Future, str::FromStr};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn};

use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};

@@ -400,21 +401,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
    }

    /// Wrapper for run_message_loop() that shuts down socket when we are done
    pub async fn run<F, S>(
    pub async fn run(
        mut self,
        handler: &mut impl Handler<IO>,
        shutdown_watcher: F,
    ) -> Result<(), QueryError>
    where
        F: Fn() -> S + Clone,
        S: Future,
    {
        let ret = self
            .run_message_loop(handler, shutdown_watcher.clone())
            .await;
        cancel: &CancellationToken,
    ) -> Result<(), QueryError> {
        let ret = self.run_message_loop(handler, cancel).await;

        tokio::select! {
            _ = shutdown_watcher() => {
            _ = cancel.cancelled() => {
                // do nothing; we were most likely already stopped by shutdown and will log it next.
            }
            _ = self.framed.shutdown() => {

@@ -444,21 +439,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        }
    }

    async fn run_message_loop<F, S>(
    async fn run_message_loop(
        &mut self,
        handler: &mut impl Handler<IO>,
        shutdown_watcher: F,
    ) -> Result<(), QueryError>
    where
        F: Fn() -> S,
        S: Future,
    {
        cancel: &CancellationToken,
    ) -> Result<(), QueryError> {
        trace!("postgres backend to {:?} started", self.peer_addr);

        tokio::select!(
            biased;

            _ = shutdown_watcher() => {
            _ = cancel.cancelled() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
                return Err(QueryError::Shutdown)

@@ -473,7 +464,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        let mut query_string = Bytes::new();
        while let Some(msg) = tokio::select!(
            biased;
            _ = shutdown_watcher() => {
            _ = cancel.cancelled() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
                return Err(QueryError::Shutdown)

@@ -485,7 +476,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            let result = self.process_message(handler, msg, &mut query_string).await;
            tokio::select!(
                biased;
                _ = shutdown_watcher() => {
                _ = cancel.cancelled() => {
                    // We were requested to shut down.
                    tracing::info!("shutdown request received during response flush");

@@ -672,11 +663,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        assert!(self.state < ProtoState::Authentication);
        let have_tls = self.tls_config.is_some();
        match msg {
            FeStartupPacket::SslRequest => {
            FeStartupPacket::SslRequest { direct } => {
                debug!("SSL requested");

                self.write_message(&BeMessage::EncryptionResponse(have_tls))
                    .await?;
                if !direct {
                    self.write_message(&BeMessage::EncryptionResponse(have_tls))
                        .await?;
                } else if !have_tls {
                    return Err(QueryError::Other(anyhow::anyhow!(
                        "direct SSL negotiation but no TLS support"
                    )));
                }

                if have_tls {
                    self.start_tls().await?;
|
||||
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
|
||||
use pq_proto::{BeMessage, RowDescriptor};
|
||||
use std::io::Cursor;
|
||||
use std::{future, sync::Arc};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tokio_postgres::tls::MakeTlsConnect;
|
||||
use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
|
||||
use tokio_postgres_rustls::MakeRustlsConnect;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
// generate client, server test streams
|
||||
async fn make_tcp_pair() -> (TcpStream, TcpStream) {
|
||||
@@ -50,7 +51,7 @@ async fn simple_select() {
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut handler = TestHandler {};
|
||||
pgbackend.run(&mut handler, future::pending::<()>).await
|
||||
pgbackend.run(&mut handler, &CancellationToken::new()).await
|
||||
});
|
||||
|
||||
let conf = Config::new();
|
||||
@@ -102,7 +103,7 @@ async fn simple_select_ssl() {
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut handler = TestHandler {};
|
||||
pgbackend.run(&mut handler, future::pending::<()>).await
|
||||
pgbackend.run(&mut handler, &CancellationToken::new()).await
|
||||
});
|
||||
|
||||
let client_cfg = rustls::ClientConfig::builder()
|
||||
|
||||
@@ -356,6 +356,28 @@ impl CheckPoint {
        }
        false
    }

    /// Advance next multi-XID/offset to the values given as arguments.
    ///
    /// It's important that this handles wraparound correctly. This should match the
    /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function.
    ///
    /// Returns 'true' if the Checkpoint was updated.
    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
        let mut modified = false;

        if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 {
            self.nextMulti = multi_xid;
            modified = true;
        }

        if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 {
            self.nextMultiOffset = multi_offset;
            modified = true;
        }

        modified
    }
}

/// Generate new, empty WAL segment, with correct block headers at the first
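The wraparound comparison takes the circular distance in modular u32 arithmetic and reinterprets it as a signed number: a positive `i32` means the candidate is "ahead" of the stored value, even if it numerically wrapped past zero. A quick illustration of that arithmetic (values chosen only for the example):

    // Candidate 1 is "ahead" of stored 0xffff_0000 despite being numerically smaller:
    let stored: u32 = 0xffff_0000;
    let candidate: u32 = 1;
    let distance = candidate.wrapping_sub(stored); // 0x0001_0001
    assert!((distance as i32) > 0); // positive => candidate is newer

    // A candidate that is logically behind yields a negative signed distance:
    assert!((500u32.wrapping_sub(1000) as i32) < 0); // -500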
@@ -202,6 +202,53 @@ pub fn test_update_next_xid() {
    assert_eq!(checkpoint.nextXid.value, 2048);
}

#[test]
pub fn test_update_next_multixid() {
    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();

    // simple case
    checkpoint.nextMulti = 20;
    checkpoint.nextMultiOffset = 20;
    checkpoint.update_next_multixid(1000, 2000);
    assert_eq!(checkpoint.nextMulti, 1000);
    assert_eq!(checkpoint.nextMultiOffset, 2000);

    // No change
    checkpoint.update_next_multixid(500, 900);
    assert_eq!(checkpoint.nextMulti, 1000);
    assert_eq!(checkpoint.nextMultiOffset, 2000);

    // Close to wraparound, but not wrapped around yet
    checkpoint.nextMulti = 0xffff0000;
    checkpoint.nextMultiOffset = 0xfffe0000;
    checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff);
    assert_eq!(checkpoint.nextMulti, 0xffff00ff);
    assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);

    // Wraparound
    checkpoint.update_next_multixid(1, 900);
    assert_eq!(checkpoint.nextMulti, 1);
    assert_eq!(checkpoint.nextMultiOffset, 900);

    // Wraparound nextMulti to 0.
    //
    // It's a bit surprising that nextMulti can be 0, because that's a special value
    // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound:
    // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips
    // the 0 and the next multi-xid actually assigned is 1.
    checkpoint.nextMulti = 0xffff0000;
    checkpoint.nextMultiOffset = 0xfffe0000;
    checkpoint.update_next_multixid(0, 0xfffe00ff);
    assert_eq!(checkpoint.nextMulti, 0);
    assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff);

    // Wraparound nextMultiOffset to 0
    checkpoint.update_next_multixid(0, 0);
    assert_eq!(checkpoint.nextMulti, 0);
    assert_eq!(checkpoint.nextMultiOffset, 0);
}

#[test]
pub fn test_encode_logical_message() {
    let expected = [
@@ -44,9 +44,9 @@ impl ConnectionError {
/// Wraps async io `stream`, providing messages to write/flush + read Postgres
/// messages.
pub struct Framed<S> {
    stream: S,
    read_buf: BytesMut,
    write_buf: BytesMut,
    pub stream: S,
    pub read_buf: BytesMut,
    pub write_buf: BytesMut,
}

impl<S> Framed<S> {
@@ -39,14 +39,39 @@ pub enum FeMessage {
    PasswordMessage(Bytes),
}

#[derive(Clone, Copy, PartialEq, PartialOrd)]
pub struct ProtocolVersion(u32);

impl ProtocolVersion {
    pub const fn new(major: u16, minor: u16) -> Self {
        Self((major as u32) << 16 | minor as u32)
    }
    pub const fn minor(self) -> u16 {
        self.0 as u16
    }
    pub const fn major(self) -> u16 {
        (self.0 >> 16) as u16
    }
}

impl fmt::Debug for ProtocolVersion {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_list()
            .entry(&self.major())
            .entry(&self.minor())
            .finish()
    }
}
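`ProtocolVersion` packs major and minor into one `u32` exactly as the wire format does: major in the high 16 bits, minor in the low 16. A small round-trip check (values are illustrative):

    let v3 = ProtocolVersion::new(3, 0); // the standard v3 protocol
    assert_eq!(v3.major(), 3);
    assert_eq!(v3.minor(), 0);

    // The special request codes reuse the same packing with major 1234:
    // 1234 << 16 | 5679 == 0x04D2_162F for the SSL negotiation code.
    let ssl = ProtocolVersion::new(1234, 5679);
    assert_eq!((ssl.major(), ssl.minor()), (1234, 5679));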
#[derive(Debug)]
pub enum FeStartupPacket {
    CancelRequest(CancelKeyData),
    SslRequest,
    SslRequest {
        direct: bool,
    },
    GssEncRequest,
    StartupMessage {
        major_version: u32,
        minor_version: u32,
        version: ProtocolVersion,
        params: StartupMessageParams,
    },
}

@@ -301,11 +326,23 @@ impl FeStartupPacket {
    /// different from [`FeMessage::parse`] because startup messages don't have
    /// message type byte; otherwise, its comments apply.
    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
        const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
        const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
        const CANCEL_REQUEST_CODE: u32 = 5678;
        const NEGOTIATE_SSL_CODE: u32 = 5679;
        const NEGOTIATE_GSS_CODE: u32 = 5680;
        const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
        const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
        const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
        const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);

        // <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
        // First byte indicates standard SSL handshake message
        // (It can't be a Postgres startup length because in network byte order
        // that would be a startup packet hundreds of megabytes long)
        if buf.first() == Some(&0x16) {
            return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
        }

        // need at least 4 bytes with packet len
        if buf.len() < 4 {

@@ -338,12 +375,10 @@ impl FeStartupPacket {
        let mut msg = buf.split_to(len).freeze();
        msg.advance(4); // consume len

        let request_code = msg.get_u32();
        let req_hi = request_code >> 16;
        let req_lo = request_code & ((1 << 16) - 1);
        let request_code = ProtocolVersion(msg.get_u32());
        // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
        let message = match (req_hi, req_lo) {
            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
        let message = match request_code {
            CANCEL_REQUEST_CODE => {
                if msg.remaining() != 8 {
                    return Err(ProtocolError::BadMessage(
                        "CancelRequest message is malformed, backend PID / secret key missing"

@@ -355,21 +390,22 @@ impl FeStartupPacket {
                    cancel_key: msg.get_i32(),
                })
            }
            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
            NEGOTIATE_SSL_CODE => {
                // Requested upgrade to SSL (aka TLS)
                FeStartupPacket::SslRequest
                FeStartupPacket::SslRequest { direct: false }
            }
            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
            NEGOTIATE_GSS_CODE => {
                // Requested upgrade to GSSAPI
                FeStartupPacket::GssEncRequest
            }
            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
            version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
                return Err(ProtocolError::Protocol(format!(
                    "Unrecognized request code {unrecognized_code}"
                    "Unrecognized request code {}",
                    version.minor()
                )));
            }
            // TODO bail if protocol major_version is not 3?
            (major_version, minor_version) => {
            version => {
                // StartupMessage

                let s = str::from_utf8(&msg).map_err(|_e| {

@@ -382,8 +418,7 @@ impl FeStartupPacket {
        })?;

        FeStartupPacket::StartupMessage {
            major_version,
            minor_version,
            version,
            params: StartupMessageParams {
                params: msg.slice_ref(s.as_bytes()),
            },
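The direct-SSL path relies on the fact that a TLS ClientHello begins with record type byte 0x16, which can never be the first byte of a sane length-prefixed startup packet. A hedged sketch of that sniffing logic in isolation (the helper name and buffer contents are made up for the example):

    use bytes::BytesMut;

    // A TLS handshake record starts with 0x16; interpreted as a startup
    // length it would imply an absurd ~369 MB packet, so the two cases
    // cannot be confused.
    fn looks_like_direct_tls(buf: &BytesMut) -> bool {
        buf.first() == Some(&0x16)
    }

    let hello = BytesMut::from(&[0x16, 0x03, 0x01][..]); // TLS record header prefix
    assert!(looks_like_direct_tls(&hello));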
@@ -522,6 +557,10 @@ pub enum BeMessage<'a> {
    RowDescription(&'a [RowDescriptor<'a>]),
    XLogData(XLogDataBody<'a>),
    NoticeResponse(&'a str),
    NegotiateProtocolVersion {
        version: ProtocolVersion,
        options: &'a [&'a str],
    },
    KeepAlive(WalSndKeepAlive),
}

@@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> {
                buf.put_u8(u8::from(req.request_reply));
            });
        }

        BeMessage::NegotiateProtocolVersion { version, options } => {
            buf.put_u8(b'v');
            write_body(buf, |buf| {
                buf.put_u32(version.0);
                buf.put_u32(options.len() as u32);
                for option in options.iter() {
                    write_cstr(option, buf)?;
                }
                Ok(())
            })?
        }
    }
    Ok(())
}
@@ -1,6 +1,5 @@
use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};

use anyhow::bail;
use aws_sdk_s3::types::StorageClass;
use camino::Utf8PathBuf;

@@ -176,20 +175,8 @@ fn serialize_storage_class<S: serde::Serializer>(
impl RemoteStorageConfig {
    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);

    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let document: toml_edit::Document = match toml {
            toml_edit::Item::Table(toml) => toml.clone().into(),
            toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
                toml.clone().into_table().into()
            }
            _ => bail!("toml not a table or inline table"),
        };

        if document.is_empty() {
            return Ok(None);
        }

        Ok(Some(toml_edit::de::from_document(document)?))
    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
        Ok(utils::toml_edit_ext::deserialize_item(toml)?)
    }
}

@@ -197,7 +184,7 @@ impl RemoteStorageConfig {
mod tests {
    use super::*;

    fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
    fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
        let toml = input.parse::<toml_edit::Document>().unwrap();
        RemoteStorageConfig::from_toml(toml.as_item())
    }

@@ -207,7 +194,7 @@ mod tests {
    let input = "local_path = '.'
timeout = '5s'";

    let config = parse(input).unwrap().expect("it exists");
    let config = parse(input).unwrap();

    assert_eq!(
        config,

@@ -229,7 +216,7 @@ timeout = '5s'";
timeout = '7s'
";

    let config = parse(toml).unwrap().expect("it exists");
    let config = parse(toml).unwrap();

    assert_eq!(
        config,

@@ -257,7 +244,7 @@ timeout = '5s'";
timeout = '7s'
";

    let config = parse(toml).unwrap().expect("it exists");
    let config = parse(toml).unwrap();

    assert_eq!(
        config,
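After the refactor, `from_toml` no longer encodes "empty table means absent" as an `Option`; callers get a config or an error. A sketch of the new call shape, using the same TOML fields as the test above (setup is illustrative):

    // The new signature returns the config directly; an empty table is now
    // an error rather than Ok(None). (Sketch; error handling elided.)
    let doc = "local_path = '.'\ntimeout = '5s'"
        .parse::<toml_edit::Document>()
        .unwrap();
    let config = RemoteStorageConfig::from_toml(doc.as_item()).unwrap();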
@@ -34,10 +34,10 @@ struct SegmentSize {
}

struct SizeAlternatives {
    // cheapest alternative if parent is available.
    /// cheapest alternative if parent is available.
    incremental: SegmentSize,

    // cheapest alternative if parent node is not available
    /// cheapest alternative if parent node is not available
    non_incremental: Option<SegmentSize>,
}
@@ -3,10 +3,17 @@ use std::fmt::Write;

const SVG_WIDTH: f32 = 500.0;

/// Different branch kind for SVG drawing.
#[derive(PartialEq)]
pub enum SvgBranchKind {
    Timeline,
    Lease,
}

struct SvgDraw<'a> {
    storage: &'a StorageModel,
    branches: &'a [String],
    seg_to_branch: &'a [usize],
    seg_to_branch: &'a [(usize, SvgBranchKind)],
    sizes: &'a [SegmentSizeResult],

    // layout

@@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> {
        "<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
    )?;
    writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
    writeln!(
        result,
        "<line x1=\"10\" y1=\"85\" x2=\"10\" y2=\"95\" stroke-width=\"3\" stroke=\"blue\" />"
    )?;
    writeln!(result, "<text x=\"20\" y=\"95\">LSN lease</text>")?;
    Ok(())
}

pub fn draw_svg(
    storage: &StorageModel,
    branches: &[String],
    seg_to_branch: &[usize],
    seg_to_branch: &[(usize, SvgBranchKind)],
    sizes: &SizeResult,
) -> anyhow::Result<String> {
    let mut draw = SvgDraw {

@@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> {

    // Layout the timelines on Y dimension.
    // TODO
    let mut y = 100.0;
    let mut y = 120.0;
    let mut branch_y_coordinates = Vec::new();
    for _branch in self.branches {
        branch_y_coordinates.push(y);

@@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> {

    // Calculate coordinates for each point
    let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
        .map(|(seg, branch_id)| {
        .map(|(seg, (branch_id, _))| {
            let x = (seg.lsn - min_lsn) as f32 / xscale;
            let y = branch_y_coordinates[*branch_id];
            (x, y)

@@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> {

    // draw a snapshot point if it's needed
    let (coord_x, coord_y) = self.seg_coordinates[seg_id];

    let (_, kind) = &self.seg_to_branch[seg_id];
    if kind == &SvgBranchKind::Lease {
        let (x1, y1) = (coord_x, coord_y - 10.0);
        let (x2, y2) = (coord_x, coord_y + 10.0);

        let style = "stroke-width=\"3\" stroke=\"blue\"";

        writeln!(
            result,
            "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
        )?;
        writeln!(result, " <title>leased lsn at {}</title>", seg.lsn)?;
        writeln!(result, "</line>")?;
    }

    if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
        writeln!(
            result,

@@ -40,6 +40,7 @@ thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
libs/utils/src/circuit_breaker.rs (new file, 114 lines)
@@ -0,0 +1,114 @@
use std::{
    fmt::Display,
    time::{Duration, Instant},
};

use metrics::IntCounter;

/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
/// to mitigate the log spam from repeated failures.
pub struct CircuitBreaker {
    /// An identifier that enables us to log useful errors when a circuit is broken
    name: String,

    /// Consecutive failures since last success
    fail_count: usize,

    /// How many consecutive failures before we break the circuit
    fail_threshold: usize,

    /// If circuit is broken, when was it broken?
    broken_at: Option<Instant>,

    /// If set, we will auto-reset the circuit this long after it was broken. If None, broken
    /// circuits stay broken forever, or until success() is called.
    reset_period: Option<Duration>,

    /// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker
    /// to permit something to keep running even if it would otherwise have tripped it.
    short_circuit: bool,
}

impl CircuitBreaker {
    pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
        Self {
            name,
            fail_count: 0,
            fail_threshold,
            broken_at: None,
            reset_period,
            short_circuit: false,
        }
    }

    /// Construct an unbreakable circuit breaker, for use in unit tests etc.
    pub fn short_circuit() -> Self {
        Self {
            name: String::new(),
            fail_threshold: 0,
            fail_count: 0,
            broken_at: None,
            reset_period: None,
            short_circuit: true,
        }
    }

    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        if self.short_circuit {
            return;
        }

        self.fail_count += 1;
        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
            self.break_circuit(metric, error);
        }
    }

    /// Call this after successfully executing an operation
    pub fn success(&mut self, metric: &IntCounter) {
        self.fail_count = 0;
        if let Some(broken_at) = &self.broken_at {
            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
                humantime::format_duration(broken_at.elapsed()));
            self.broken_at = None;
            metric.inc();
        }
    }

    /// Call this before attempting an operation, and skip the operation if we are currently broken.
    pub fn is_broken(&mut self) -> bool {
        if self.short_circuit {
            return false;
        }

        if let Some(broken_at) = self.broken_at {
            match self.reset_period {
                Some(reset_period) if broken_at.elapsed() > reset_period => {
                    self.reset_circuit();
                    false
                }
                _ => true,
            }
        } else {
            false
        }
    }

    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        self.broken_at = Some(Instant::now());
        tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}");
        metric.inc();
    }

    fn reset_circuit(&mut self) {
        self.broken_at = None;
        self.fail_count = 0;
    }
}
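The intended call pattern is: check `is_broken()` before the operation, then report `success()` or `fail()` afterwards. A minimal sketch of a caller; the counters and the fallible operation are stand-ins, not from the diff:

    use std::time::Duration;
    use utils::circuit_breaker::CircuitBreaker;

    // Counters here are ad-hoc; real callers use registered metrics.
    let broken = metrics::IntCounter::new("breaker_broken", "times broken").unwrap();
    let unbroken = metrics::IntCounter::new("breaker_unbroken", "times reset").unwrap();

    let mut breaker =
        CircuitBreaker::new("compaction".to_string(), 5, Some(Duration::from_secs(600)));

    if !breaker.is_broken() {
        match try_compaction() {             // placeholder fallible operation
            Ok(()) => breaker.success(&unbroken),
            Err(e) => breaker.fail(&broken, e),
        }
    }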
@@ -9,20 +9,11 @@ use serde::{Deserialize, Serialize};
/// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub enum Generation {
    // Generations with this magic value will not add a suffix to S3 keys, and will not
    // be included in persisted index_part.json. This value is only to be used
    // during migration from pre-generation metadata to generation-aware metadata,
    // and should eventually go away.
    //
    // A special Generation is used rather than always wrapping Generation in an Option,
    // so that code handling generations doesn't have to be aware of the legacy
    // case everywhere it touches a generation.
    // The None Generation is used in the metadata of layers written before generations were
    // introduced. A running Tenant always has a valid generation, but the layer metadata may
    // include None generations.
    None,
    // Generations with this magic value may never be used to construct S3 keys:
    // we will panic if someone tries to. This is for Tenants in the "Broken" state,
    // so that we can satisfy their constructor with a Generation without risking
    // a code bug using it in an S3 write (broken tenants should never write)
    Broken,

    Valid(u32),
}

@@ -42,11 +33,6 @@ impl Generation {
        Self::None
    }

    // Create a new generation that will panic if you try to use get_suffix
    pub fn broken() -> Self {
        Self::Broken
    }

    pub const fn new(v: u32) -> Self {
        Self::Valid(v)
    }

@@ -60,9 +46,6 @@ impl Generation {
    match self {
        Self::Valid(v) => GenerationFileSuffix(Some(*v)),
        Self::None => GenerationFileSuffix(None),
        Self::Broken => {
            panic!("Tried to use a broken generation");
        }
    }
}

@@ -86,7 +69,6 @@ impl Generation {
            }
        }
        Self::None => Self::None,
        Self::Broken => panic!("Attempted to use a broken generation"),
    }
}

@@ -95,7 +77,6 @@ impl Generation {
    match self {
        Self::Valid(n) => Self::Valid(*n + 1),
        Self::None => Self::Valid(1),
        Self::Broken => panic!("Attempted to use a broken generation"),
    }
}

@@ -128,7 +109,7 @@ impl Serialize for Generation {
    if let Self::Valid(v) = self {
        v.serialize(serializer)
    } else {
        // We should never be asked to serialize a None or Broken. Structures
        // We should never be asked to serialize a None. Structures
        // that include an optional generation should convert None to an
        // Option<Generation>::None
        Err(serde::ser::Error::custom(

@@ -159,9 +140,6 @@ impl Debug for Generation {
        Self::None => {
            write!(f, "<none>")
        }
        Self::Broken => {
            write!(f, "<broken>")
        }
    }
}
}
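With `Broken` removed, `Generation` is a two-state type and its increment is total. A quick illustration of the semantics in the `@@ -95,7 +77,6` hunk; note that `none()` and `next()` are assumed names here, since the method names are not visible in this diff:

    // Sketch; method names are assumptions, the match arms above are the source.
    assert_eq!(Generation::none().next(), Generation::new(1)); // None -> Valid(1)
    assert_eq!(Generation::new(7).next(), Generation::new(8)); // Valid(n) -> Valid(n + 1)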
@@ -52,17 +52,17 @@ struct RequestId(String);
/// There could be other ways to implement similar functionality:
///
/// * procmacros placed on top of all handler methods
/// With all the drawbacks of procmacros, brings no difference implementation-wise,
/// and little code reduction compared to the existing approach.
///   With all the drawbacks of procmacros, brings no difference implementation-wise,
///   and little code reduction compared to the existing approach.
///
/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
/// implemented for [`RouterBuilder`].
/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use another library later.
///   implemented for [`RouterBuilder`].
///   Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use another library later.
///
/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
/// later, in a post-response middleware.
/// Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
/// tries to achieve with its `.instrument` used in the current approach.
///   later, in a post-response middleware.
///   Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
///   tries to achieve with its `.instrument` used in the current approach.
///
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
@@ -8,22 +8,15 @@ use super::error::ApiError;
pub async fn json_request<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
) -> Result<T, ApiError> {
    json_request_or_empty_body(request)
        .await?
        .context("missing request body")
        .map_err(ApiError::BadRequest)
}

/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
) -> Result<Option<T>, ApiError> {
    let body = hyper::body::aggregate(request.body_mut())
        .await
        .context("Failed to read request body")
        .map_err(ApiError::BadRequest)?;

    if body.remaining() == 0 {
        return Ok(None);
        return Err(ApiError::BadRequest(anyhow::anyhow!(
            "missing request body"
        )));
    }

    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());

@@ -31,7 +24,6 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    serde_path_to_error::deserialize(&mut deser)
        // intentionally stringify because the debug version is not helpful in python logs
        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
        .map(Some)
        .map_err(ApiError::BadRequest)
}
@@ -74,6 +74,15 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
    .transpose()
}

pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
    request: &Request<Body>,
    param_name: &str,
) -> Result<T, ApiError> {
    parse_query_param(request, param_name)?.ok_or_else(|| {
        ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
    })
}

pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
    match request.body_mut().data().await {
        Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
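`must_parse_query_param` turns a missing parameter into a `BadRequest` instead of an `Option`, which suits handlers whose parameters are mandatory. A sketch of a handler using it; the handler and parameter name are illustrative:

    use hyper::{Body, Request};

    // Hypothetical handler requiring a mandatory `lsn` query parameter.
    async fn handler(request: Request<Body>) -> Result<hyper::Response<Body>, ApiError> {
        let lsn: utils::lsn::Lsn = must_parse_query_param(&request, "lsn")?;
        // ... use `lsn`; a missing or unparsable value became BadRequest above.
        let _ = lsn;
        Ok(hyper::Response::new(Body::empty()))
    }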
@@ -302,17 +302,6 @@ pub struct TenantId(Id);

id_newtype!(TenantId);

/// Neon Connection Id identifies long-lived connections (for example a pagestream
/// connection with the page_service). It is used for better logging and tracing.
///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct ConnectionId(Id);

id_newtype!(ConnectionId);

// A pair uniquely identifying Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TenantTimelineId {

@@ -26,6 +26,8 @@ pub mod auth;
// utility functions and helper traits for unified unique id generation/serialization etc.
pub mod id;

pub mod shard;

mod hex;
pub use hex::Hex;

@@ -94,6 +96,10 @@ pub mod env;

pub mod poison;

pub mod toml_edit_ext;

pub mod circuit_breaker;

/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:
libs/utils/src/shard.rs (new file, 451 lines)
@@ -0,0 +1,451 @@
//! See `pageserver_api::shard` for a description of sharding.

use std::{ops::RangeInclusive, str::FromStr};

use hex::FromHex;
use serde::{Deserialize, Serialize};

use crate::id::TenantId;

#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);

#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(pub u8);

/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

/// Formatting helper, for generating the `shard_id` label in traces.
pub struct ShardSlug<'a>(&'a TenantShardId);

/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
///   # The second shard in a two-shard tenant
///   072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
    pub tenant_id: TenantId,
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
    /// as [`TenantShardId::unsharded`].
    ///
    /// This method returns the actual number of shards, i.e. if our internal value is
    /// zero, we return 1 (unsharded tenants have 1 shard).
    pub fn count(&self) -> u8 {
        if self.0 > 0 {
            self.0
        } else {
            1
        }
    }

    /// The literal internal value: this is **not** the number of shards in the
    /// tenant, as we have a special zero value for legacy unsharded tenants. Use
    /// [`Self::count`] if you want to know the cardinality of shards.
    pub fn literal(&self) -> u8 {
        self.0
    }

    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
    /// uses the legacy format for `TenantShardId`. See also the documentation for
    /// [`Self::count`].
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }

    /// `val` may be zero, or the number of shards in the tenant. `val` is what
    /// [`Self::literal`] would return.
    pub const fn new(val: u8) -> Self {
        Self(val)
    }
}

impl ShardNumber {
    pub const MAX: Self = Self(u8::MAX);
}

impl TenantShardId {
    pub fn unsharded(tenant_id: TenantId) -> Self {
        Self {
            tenant_id,
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    /// The range of all TenantShardId that belong to a particular TenantId. This is useful when
    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
        RangeInclusive::new(
            Self {
                tenant_id,
                shard_number: ShardNumber(0),
                shard_count: ShardCount(0),
            },
            Self {
                tenant_id,
                shard_number: ShardNumber::MAX,
                shard_count: ShardCount::MAX,
            },
        )
    }

    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
        ShardSlug(self)
    }

    /// Convenience for code that has special behavior on the 0th shard.
    pub fn is_shard_zero(&self) -> bool {
        self.shard_number == ShardNumber(0)
    }

    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
    }

    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
    /// is useful when logging from code that is already in a span that includes tenant ID, to
    /// keep messages reasonably terse.
    pub fn to_index(&self) -> ShardIndex {
        ShardIndex {
            shard_number: self.shard_number,
            shard_count: self.shard_count,
        }
    }

    /// Calculate the children of this TenantShardId when splitting the overall tenant into
    /// the given number of shards.
    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
        let mut child_shards = Vec::new();
        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
            // Key mapping is based on a round robin mapping of key hash modulo shard count,
            // so our child shards are the ones which the same keys would map to.
            if shard_number % effective_old_shard_count == self.shard_number.0 {
                child_shards.push(TenantShardId {
                    tenant_id: self.tenant_id,
                    shard_number: ShardNumber(shard_number),
                    shard_count: new_shard_count,
                })
            }
        }

        child_shards
    }
}
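The split rule follows directly from the round-robin key mapping: a child shard belongs to the parent whose number equals the child's number modulo the old shard count. A worked example of the arithmetic (the tenant id is a placeholder taken from the doc comment above):

    use std::str::FromStr;
    use utils::id::TenantId;
    use utils::shard::{ShardCount, ShardNumber, TenantShardId};

    let tenant_id = TenantId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();
    let parent = TenantShardId {
        tenant_id,
        shard_number: ShardNumber(1),
        shard_count: ShardCount(2),
    };
    // Splitting 2 -> 4: shard 1's children are the n in 0..4 with n % 2 == 1.
    let children = parent.split(ShardCount(4));
    let numbers: Vec<u8> = children.iter().map(|c| c.shard_number.0).collect();
    assert_eq!(numbers, vec![1, 3]);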
impl<'a> std::fmt::Display for ShardSlug<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{:02x}{:02x}",
            self.0.shard_number.0, self.0.shard_count.0
        )
    }
}

impl std::fmt::Display for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.shard_count != ShardCount(0) {
            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
        } else {
            // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
            // is distinct from the normal single shard case (shard count == 1).
            self.tenant_id.fmt(f)
        }
    }
}

impl std::fmt::Debug for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for TenantShardId {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
        if s.len() == 32 {
            // Legacy case: no shard specified
            Ok(Self {
                tenant_id: TenantId::from_str(s)?,
                shard_number: ShardNumber(0),
                shard_count: ShardCount(0),
            })
        } else if s.len() == 37 {
            let bytes = s.as_bytes();
            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
            Ok(Self {
                tenant_id,
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}
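Display and FromStr round-trip through the slug encoding: two zero-padded hex bytes, shard number then shard count, appended after a dash. For instance, continuing the placeholder id from above:

    // "072f1291a5310026820b2fe4b2968934-0102" is shard 1 of 2.
    let id = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102").unwrap();
    assert_eq!(id.shard_number.0, 1);
    assert_eq!(id.shard_count.0, 2);
    assert_eq!(id.to_string(), "072f1291a5310026820b2fe4b2968934-0102");

    // A bare 32-char TenantId parses as the legacy unsharded form and
    // re-encodes without a suffix.
    let legacy = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();
    assert!(legacy.is_unsharded());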
impl From<[u8; 18]> for TenantShardId {
    fn from(b: [u8; 18]) -> Self {
        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();

        Self {
            tenant_id: TenantId::from(tenant_id_bytes),
            shard_number: ShardNumber(b[16]),
            shard_count: ShardCount(b[17]),
        }
    }
}

impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            shard_number: number,
            shard_count: count,
        }
    }
    pub fn unsharded() -> Self {
        Self {
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }

    /// For use in constructing remote storage paths: concatenate this with a TenantId
    /// to get a fully qualified TenantShardId.
    ///
    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
    /// that the legacy pre-sharding remote key format is preserved.
    pub fn get_suffix(&self) -> String {
        if self.is_unsharded() {
            "".to_string()
        } else {
            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
        }
    }
}

impl std::fmt::Display for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
    }
}

impl std::fmt::Debug for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for ShardIndex {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 1 byte shard number, 1 byte shard count
        if s.len() == 4 {
            let bytes = s.as_bytes();
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(bytes, &mut shard_parts)?;
            Ok(Self {
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}

impl From<[u8; 2]> for ShardIndex {
    fn from(b: [u8; 2]) -> Self {
        Self {
            shard_number: ShardNumber(b[0]),
            shard_count: ShardCount(b[1]),
        }
    }
}

impl Serialize for TenantShardId {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Note: while human encoding of [`TenantShardId`] is backward and forward
            // compatible, this binary encoding is not.
            let mut packed: [u8; 18] = [0; 18];
            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
            packed[16] = self.shard_number.0;
            packed[17] = self.shard_count.0;

            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for TenantShardId {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = TenantShardId;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 18])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 18] = Deserialize::deserialize(s)?;
                Ok(TenantShardId::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                TenantShardId::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                18,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}

impl Serialize for ShardIndex {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Binary encoding is not used in index_part.json, but is included in anticipation of
            // switching various structures (e.g. inter-process communication, remote metadata) to more
            // compact binary encodings in future.
            let mut packed: [u8; 2] = [0; 2];
            packed[0] = self.shard_number.0;
            packed[1] = self.shard_count.0;
            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for ShardIndex {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = ShardIndex;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 2])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 2] = Deserialize::deserialize(s)?;
                Ok(ShardIndex::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                ShardIndex::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                2,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}
libs/utils/src/toml_edit_ext.rs (new file, 22 lines)
@@ -0,0 +1,22 @@
#[derive(Debug, thiserror::Error)]
pub enum Error {
    #[error("item is not a document")]
    ItemIsNotADocument,
    #[error(transparent)]
    Serde(toml_edit::de::Error),
}

pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
where
    T: serde::de::DeserializeOwned,
{
    let document: toml_edit::Document = match item {
        toml_edit::Item::Table(toml) => toml.clone().into(),
        toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
            toml.clone().into_table().into()
        }
        _ => return Err(Error::ItemIsNotADocument),
    };

    toml_edit::de::from_document(document).map_err(Error::Serde)
}
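`deserialize_item` is the generic helper that `RemoteStorageConfig::from_toml` now delegates to: it accepts either a table or an inline table and runs serde over it. A sketch with a made-up target struct:

    #[derive(serde::Deserialize)]
    struct Example {            // hypothetical config struct, for illustration only
        local_path: String,
    }

    let doc = "cfg = { local_path = '.' }"
        .parse::<toml_edit::Document>()
        .unwrap();
    let item = doc.get("cfg").unwrap();
    let parsed: Example = utils::toml_edit_ext::deserialize_item(item).unwrap();
    assert_eq!(parsed.local_path, ".");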
@@ -1,3 +1,5 @@
#![allow(clippy::todo)]

use std::ffi::CString;

use crate::{

@@ -62,6 +62,7 @@ sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true

@@ -48,6 +48,7 @@
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! ```

use anyhow::Context;
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};

@@ -188,6 +189,7 @@ impl Request {
    manager
        .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
        .await
        .context("request_redo")
}

fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {

@@ -8,7 +8,7 @@ license.workspace = true
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest.workspace = true
reqwest = { workspace = true, features = [ "stream" ] }
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
@@ -1,6 +1,7 @@
use std::collections::HashMap;

use bytes::Bytes;
use detach_ancestor::AncestorDetached;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{

@@ -9,6 +10,8 @@ use utils::{
    lsn::Lsn,
};

pub use reqwest::Body as ReqwestBody;

pub mod util;

#[derive(Debug, Clone)]

@@ -20,6 +23,9 @@ pub struct Client {

#[derive(thiserror::Error, Debug)]
pub enum Error {
    #[error("send request: {0}")]
    SendRequest(reqwest::Error),

    #[error("receive body: {0}")]
    ReceiveBody(reqwest::Error),

@@ -173,19 +179,30 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }

    fn start_request<U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
    ) -> reqwest::RequestBuilder {
        let req = self.client.request(method, uri);
        if let Some(value) = &self.authorization_header {
            req.header(reqwest::header::AUTHORIZATION, value)
        } else {
            req
        }
    }

    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
        body: B,
    ) -> Result<reqwest::Response> {
        let req = self.client.request(method, uri);
        let req = if let Some(value) = &self.authorization_header {
            req.header(reqwest::header::AUTHORIZATION, value)
        } else {
            req
        };
        req.json(&body).send().await.map_err(Error::ReceiveBody)
        self.start_request(method, uri)
            .json(&body)
            .send()
            .await
            .map_err(Error::ReceiveBody)
    }

    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(

@@ -205,15 +222,6 @@ impl Client {
        Ok(())
    }

    pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result<TenantId> {
        let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint);
        self.request(Method::POST, &uri, req)
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }

    /// The tenant deletion API can return 202 if deletion is incomplete, or
    /// 404 if it is complete. Callers are responsible for checking the status
    /// code and retrying. Error codes other than 404 will return Err().

@@ -411,6 +419,23 @@ impl Client {
        }
    }

    pub async fn timeline_detach_ancestor(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Result<AncestorDetached> {
        let uri = format!(
            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
            self.mgmt_api_endpoint
        );

        self.request(Method::PUT, &uri, ())
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }

    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",

@@ -618,4 +643,53 @@ impl Client {
            }),
        }
    }

    pub async fn import_basebackup(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        base_lsn: Lsn,
        end_lsn: Lsn,
        pg_version: u32,
        basebackup_tarball: ReqwestBody,
    ) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
            self.mgmt_api_endpoint,
        );
        self.start_request(Method::PUT, uri)
            .body(basebackup_tarball)
            .send()
            .await
            .map_err(Error::SendRequest)?
            .error_from_body()
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }

    pub async fn import_wal(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        start_lsn: Lsn,
        end_lsn: Lsn,
        wal_tarball: ReqwestBody,
    ) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
            self.mgmt_api_endpoint,
        );
        self.start_request(Method::PUT, uri)
            .body(wal_tarball)
            .send()
            .await
            .map_err(Error::SendRequest)?
            .error_from_body()
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }
}
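`import_basebackup` streams the tarball as the request body, which is why the `stream` feature was enabled on `reqwest` and `ReqwestBody` is re-exported. A sketch of a caller streaming a tarball from disk; the path, ids, and LSNs are placeholders:

    // Hypothetical usage; ids and LSNs are placeholders.
    let file = tokio::fs::File::open("base.tar").await?;
    let body = ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(file));
    client
        .import_basebackup(tenant_id, timeline_id, base_lsn, end_lsn, 16, body)
        .await?;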
@@ -131,7 +131,7 @@ impl CompactionKey for Key {
pub type CompactionKeySpace<K> = Vec<Range<K>>;

/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
pub trait CompactionLayer<K: CompactionKey> {
    fn key_range(&self) -> &Range<K>;
    fn lsn_range(&self) -> &Range<Lsn>;

@@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> {
    let toml_item = toml_document
        .get("remote_storage")
        .expect("need remote_storage");
    let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
    let config = RemoteStorageConfig::from_toml(toml_item)?;
    let storage = remote_storage::GenericRemoteStorage::from_config(&config);
    let cancel = CancellationToken::new();
    storage
@@ -348,35 +348,36 @@ where
            self.add_rel(rel, rel).await?;
        }
    }

        for (path, content) in self
            .timeline
            .list_aux_files(self.lsn, self.ctx)
            .await
            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            if path.starts_with("pg_replslot") {
                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                let restart_lsn = Lsn(u64::from_le_bytes(
                    content[offs..offs + 8].try_into().unwrap(),
                ));
                info!("Replication slot {} restart LSN={}", path, restart_lsn);
                min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
            } else if path == "pg_logical/replorigin_checkpoint" {
                // replorigin_checkpoint is written only on compute shutdown, so it contains
                // stale values. So we generate our own version of this file for the particular LSN
                // based on information about replorigins extracted from transaction commit records.
                // In the future we will not generate an AUX record for "pg_logical/replorigin_checkpoint"
                // at all, but for now we should handle (skip) it for backward compatibility.
                continue;
            }
            let header = new_tar_header(&path, content.len() as u64)?;
            self.ar
                .append(&header, &*content)
                .await
                .context("could not add aux file to basebackup tarball")?;
        }
    }

    for (path, content) in self
        .timeline
        .list_aux_files(self.lsn, self.ctx)
        .await
        .map_err(|e| BasebackupError::Server(e.into()))?
    {
        if path.starts_with("pg_replslot") {
            let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
            let restart_lsn = Lsn(u64::from_le_bytes(
                content[offs..offs + 8].try_into().unwrap(),
            ));
            info!("Replication slot {} restart LSN={}", path, restart_lsn);
            min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
        } else if path == "pg_logical/replorigin_checkpoint" {
            // replorigin_checkpoint is written only on compute shutdown, so it contains
            // stale values. So we generate our own version of this file for the particular LSN
            // based on information about replorigins extracted from transaction commit records.
            // In the future we will not generate an AUX record for "pg_logical/replorigin_checkpoint"
            // at all, but for now we should handle (skip) it for backward compatibility.
            continue;
        }
        let header = new_tar_header(&path, content.len() as u64)?;
        self.ar
            .append(&header, &*content)
            .await
            .context("could not add aux file to basebackup tarball")?;
    }

    if min_restart_lsn != Lsn::MAX {
        info!(
            "Min restart LSN for logical replication is {}",
@@ -47,6 +47,9 @@ use utils::{
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

const PID_FILE_NAME: &str = "pageserver.pid";

const FEATURES: &[&str] = &[

@@ -421,6 +424,10 @@ fn start_pageserver(
    background_jobs_can_start: background_jobs_barrier.clone(),
};

info!(config=?conf.l0_flush, "using l0_flush config");
let l0_flush_global_state =
    pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());

// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(

@@ -429,6 +436,7 @@ fn start_pageserver(
    broker_client: broker_client.clone(),
    remote_storage: remote_storage.clone(),
    deletion_queue_client,
    l0_flush_global_state,
},
order,
shutdown_pageserver.clone(),

@@ -652,7 +660,6 @@ fn start_pageserver(
    async move {
        page_service::libpq_listener_main(
            tenant_manager,
            broker_client,
            pg_auth,
            pageserver_listener,
            conf.pg_auth_type,
@@ -5,14 +5,13 @@
//! See also `settings.md` for a better description of every parameter.

use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::shard::TenantShardId;
use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use utils::logging::SecretString;

use once_cell::sync::OnceCell;
@@ -30,16 +29,13 @@ use utils::{
    logging::LogFormat,
};

use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{
    TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
    TIMELINE_DELETE_MARK_SUFFIX,
};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};

use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;

@@ -53,6 +49,7 @@ pub mod defaults {
    DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
    DEFAULT_PG_LISTEN_PORT,
};
use pageserver_api::models::ImageCompressionAlgorithm;
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
@@ -93,6 +90,9 @@ pub mod defaults {

pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
    ImageCompressionAlgorithm::Disabled;

pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
@@ -162,7 +162,7 @@ pub mod defaults {

#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}

[remote_storage]
#[remote_storage]

"#
);
@@ -288,12 +288,16 @@ pub struct PageServerConf {

pub validate_vectored_get: bool,

pub image_compression: ImageCompressionAlgorithm,

/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
/// of ephemeral data.
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,

pub l0_flush: L0FlushConfig,
}

/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -398,7 +402,11 @@ struct PageServerConfigBuilder {

validate_vectored_get: BuilderValue<bool>,

image_compression: BuilderValue<ImageCompressionAlgorithm>,

ephemeral_bytes_per_memory_kb: BuilderValue<usize>,

l0_flush: BuilderValue<L0FlushConfig>,
}

impl PageServerConfigBuilder {
@@ -485,8 +493,10 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
    NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
}
}
}
@@ -670,10 +680,18 @@ impl PageServerConfigBuilder {
    self.validate_vectored_get = BuilderValue::Set(value);
}

pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
    self.image_compression = BuilderValue::Set(value);
}

pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
    self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}

pub fn l0_flush(&mut self, value: L0FlushConfig) {
    self.l0_flush = BuilderValue::Set(value);
}

pub fn build(self) -> anyhow::Result<PageServerConf> {
    let default = Self::default_values();

@@ -730,7 +748,9 @@ impl PageServerConfigBuilder {
    get_impl,
    max_vectored_read_bytes,
    validate_vectored_get,
    image_compression,
    ephemeral_bytes_per_memory_kb,
    l0_flush,
}
CUSTOM LOGIC
{
@@ -810,15 +830,11 @@ impl PageServerConf {
}

/// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located.
///
/// Legacy: superseded by tenant_location_config_path. Eventually
/// remove this function.
pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
    self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
}

pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
/// where a certain tenant's LocationConf should be stored.
pub(crate) fn tenant_location_config_path(
    &self,
    tenant_shard_id: &TenantShardId,
) -> Utf8PathBuf {
    self.tenant_path(tenant_shard_id)
        .join(TENANT_LOCATION_CONFIG_NAME)
}
@@ -853,22 +869,6 @@ impl PageServerConf {
    )
}

pub fn traces_path(&self) -> Utf8PathBuf {
    self.workdir.join("traces")
}

pub fn trace_path(
    &self,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    connection_id: &ConnectionId,
) -> Utf8PathBuf {
    self.traces_path()
        .join(tenant_shard_id.to_string())
        .join(timeline_id.to_string())
        .join(connection_id.to_string())
}

/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
    remote_path.with_base(&self.workdir)
@@ -925,7 +925,7 @@ impl PageServerConf {
"http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
"pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
"remote_storage" => {
    builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
    builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
}
"tenant_config" => {
    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
@@ -953,7 +953,7 @@ impl PageServerConf {
    builder.metric_collection_endpoint(Some(endpoint));
},
"metric_collection_bucket" => {
    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
    builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
}
"synthetic_size_calculation_interval" =>
    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
@@ -1011,9 +1011,15 @@ impl PageServerConf {
"validate_vectored_get" => {
    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
"image_compression" => {
    builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
}
"ephemeral_bytes_per_memory_kb" => {
    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
"l0_flush" => {
    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
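Editor's note: the match arms above accept three additional keys in pageserver.toml. A sketch of how such a fragment might look, kept as a Rust constant; the exact value spellings are assumptions derived from the serde shapes shown in this diff, not verified config:

// Illustrative pageserver.toml fragment for the new options; "disabled" is a
// guess at ImageCompressionAlgorithm's FromStr spelling, and the l0_flush
// table follows L0FlushConfig's tag = "mode" / kebab-case serde attributes.
#[cfg(test)]
const EXAMPLE_NEW_OPTIONS: &str = r#"
image_compression = "disabled"
ephemeral_bytes_per_memory_kb = 0
l0_flush = { mode = "direct", max_concurrency = 4 }
"#;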
@@ -1095,8 +1101,10 @@ impl PageServerConf {
max_vectored_read_bytes: MaxVectoredReadBytes(
    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
        .expect("Invalid default constant"),
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
}
}
}
@@ -1335,7 +1343,9 @@ background_task_maximum_delay = '334 s'
    .expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1408,7 +1418,9 @@ background_task_maximum_delay = '334 s'
    .expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
},
"Should be able to parse all basic config values correctly"
);
@@ -1531,34 +1543,6 @@ broker_endpoint = '{broker_endpoint}'
    Ok(())
}

#[test]
fn parse_tenant_config() -> anyhow::Result<()> {
    let tempdir = tempdir()?;
    let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;

    let broker_endpoint = "http://127.0.0.1:7777";
    let trace_read_requests = true;

    let config_string = format!(
        r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{pg_distrib_dir}'
broker_endpoint = '{broker_endpoint}'

[tenant_config]
trace_read_requests = {trace_read_requests}"#,
    );

    let toml = config_string.parse()?;

    let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
    assert_eq!(
        conf.default_tenant_conf.trace_read_requests, trace_read_requests,
        "Tenant config from pageserver config file should be parsed and updated values used as defaults for all tenants",
    );

    Ok(())
}

#[test]
fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
    let config_string = r#"
@@ -1688,6 +1672,19 @@ threshold = "20m"
    }
}

#[test]
fn empty_remote_storage_is_error() {
    let tempdir = tempdir().unwrap();
    let (workdir, _) = prepare_fs(&tempdir).unwrap();
    let input = r#"
        remote_storage = {}
    "#;
    let doc = toml_edit::Document::from_str(input).unwrap();
    let err = PageServerConf::parse_and_validate(&doc, &workdir)
        .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
    assert!(format!("{err}").contains("remote_storage"), "{err}");
}

fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
    let tempdir_path = tempdir.path();

@@ -59,6 +59,7 @@
//! 1. It should be easy to forward the context to callees.
//! 2. To propagate more data from high-level to low-level code, the functions in
//!    the middle should not need to be modified.
//!
//! The solution is to have a container structure ([`RequestContext`]) that
//! carries the information. Functions that don't care about what's in it
//! pass it along to callees.

@@ -382,17 +382,6 @@ pub enum DeletionQueueError {
}

impl DeletionQueueClient {
    pub(crate) fn broken() -> Self {
        // Channels whose receivers are immediately dropped.
        let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
        Self {
            tx,
            executor_tx,
            lsn_table: Arc::default(),
        }
    }

    /// This is cancel-safe. If you drop the future before it completes, the message
    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
    /// we decide to do a deletion the decision is always final.

@@ -190,7 +190,7 @@ where
    }
} else {
    // If we failed validation, then do not apply any of the projected updates
    warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
    info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
    metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
}
}
@@ -225,7 +225,7 @@ where
    && (tenant.generation == *validated_generation);

if !this_list_valid {
    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
    info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
    mutated = true;
} else {
@@ -265,15 +265,19 @@ paths:
            type: string
            format: hex
    post:
      description: Obtain lease for the given LSN
      parameters:
        - name: lsn
          in: query
          required: true
          schema:
            type: string
            format: hex
          description: A LSN to obtain the lease for
      description: Obtains a lease for the given LSN.
      requestBody:
        content:
          application/json:
            schema:
              type: object
              required:
                - lsn
              properties:
                lsn:
                  description: A LSN to obtain the lease for.
                  type: string
                  format: hex
      responses:
        "200":
          description: OK
@@ -367,16 +371,7 @@ paths:
                $ref: "#/components/schemas/TenantLocationConfigResponse"
        "409":
          description: |
            The tenant is already known to Pageserver in some way,
            and hence this `/attach` call has been rejected.

            Some examples of how this can happen:
            - tenant was created on this pageserver
            - tenant attachment was started by an earlier call to `/attach`.

            Callers should poll the tenant status's `attachment_status` field,
            like for status 202. See the longer description for `POST /attach`
            for details.
            The tenant is already being modified, perhaps by a concurrent call to this API
          content:
            application/json:
              schema:
@@ -762,8 +757,6 @@ components:
        For example this can be caused by s3 being unreachable. The retry may be implemented
        with a call to detach, though it would be better to not automate it and inspect the
        failed state manually before proceeding with a retry.

        See the tenant `/attach` endpoint for more information.
      type: object
      required:
        - slug
@@ -880,8 +873,6 @@ components:
          type: string
        max_lsn_wal_lag:
          type: integer
        trace_read_requests:
          type: boolean
        heatmap_period:
          type: string
    TenantConfigResponse:
@@ -10,6 +10,7 @@ use std::time::Duration;

use anyhow::{anyhow, Context, Result};
use enumset::EnumSet;
use futures::StreamExt;
use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
@@ -22,6 +23,7 @@ use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
@@ -31,32 +33,30 @@ use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TenantState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLocationConfigRequest,
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use tenant_size_model::{SizeResult, StorageModel};
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::prometheus_metrics_handler;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::must_parse_query_param;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
@@ -78,13 +78,12 @@ use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::SpawnMode;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
    TimelineInfo,
};
use utils::{
    auth::SwappableJwtAuth,
@@ -232,7 +231,7 @@ impl From<UpsertLocationError> for ApiError {
    BadRequest(e) => ApiError::BadRequest(e),
    Unavailable(_) => ApiError::ShuttingDown,
    e @ InProgress => ApiError::Conflict(format!("{e}")),
    Flush(e) | Other(e) => ApiError::InternalServerError(e),
    Flush(e) | InternalError(e) => ApiError::InternalServerError(e),
}
}
}
@@ -411,6 +410,8 @@ async fn build_timeline_info_common(

let walreceiver_status = timeline.walreceiver_status();

let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();

let info = TimelineInfo {
    tenant_id: timeline.tenant_shard_id,
    timeline_id: timeline.timeline_id,
@@ -431,6 +432,8 @@
    directory_entries_counts: timeline.get_directory_metrics().to_vec(),
    current_physical_size,
    current_logical_size_non_incremental: None,
    pitr_history_size,
    within_ancestor_pitr,
    timeline_dir_layer_file_size_sum: None,
    wal_source_connstr,
    last_received_msg_lsn,
@@ -821,58 +824,6 @@ async fn get_timestamp_of_lsn_handler(
    }
}

async fn tenant_attach_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let maybe_body: Option<TenantAttachRequest> = json_request_or_empty_body(&mut request).await?;
    let tenant_conf = match &maybe_body {
        Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?,
        None => TenantConfOpt::default(),
    };

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    info!("Handling tenant attach {tenant_id}");

    let state = get_state(&request);

    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;

    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
    let shard_params = ShardParameters::default();
    let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);

    let tenant = state
        .tenant_manager
        .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
        .await?;

    let Some(tenant) = tenant else {
        // This should never happen: indicates a bug in upsert_location
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "Upsert succeeded but didn't return tenant!"
        )));
    };

    // We might have successfully constructed a Tenant, but it could still
    // end up in a broken state:
    if let TenantState::Broken {
        reason,
        backtrace: _,
    } = tenant.current_state()
    {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "Tenant state is Broken: {reason}"
        )));
    }

    json_response(StatusCode::ACCEPTED, ())
}

async fn timeline_delete_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -903,26 +854,6 @@ async fn timeline_delete_handler(
    json_response(StatusCode::ACCEPTED, ())
}

async fn tenant_detach_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    // This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

    let state = get_state(&request);
    let conf = state.conf;
    state
        .tenant_manager
        .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
        .await?;

    json_response(StatusCode::OK, ())
}

async fn tenant_reset_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -962,7 +893,9 @@ async fn tenant_list_handler(
    state: state.clone(),
    current_physical_size: None,
    attachment_status: state.attachment_status(),
    generation: (*gen).into(),
    generation: (*gen)
        .into()
        .expect("Tenants are always attached with a generation"),
})
.collect::<Vec<TenantInfo>>();

@@ -1010,7 +943,10 @@ async fn tenant_status(
    state: state.clone(),
    current_physical_size: Some(current_physical_size),
    attachment_status: state.attachment_status(),
    generation: tenant.generation().into(),
    generation: tenant
        .generation()
        .into()
        .expect("Tenants are always attached with a generation"),
},
walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(),
@@ -1263,10 +1199,15 @@ fn synthetic_size_html_response(
    timeline_map.insert(ti.timeline_id, index);
    timeline_ids.push(ti.timeline_id.to_string());
}
let seg_to_branch: Vec<usize> = inputs
let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs
    .segments
    .iter()
    .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
    .map(|seg| {
        (
            *timeline_map.get(&seg.timeline_id).unwrap(),
            seg.kind.into(),
        )
    })
    .collect();

let svg =
@@ -1307,75 +1248,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
}

/// Helper for requests that may take a generation, which is mandatory
/// when control_plane_api is set, but otherwise defaults to Generation::none()
fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
    if state.conf.control_plane_api.is_some() {
        req_gen
            .map(Generation::new)
            .ok_or(ApiError::BadRequest(anyhow!(
                "generation attribute missing"
            )))
    } else {
        // Legacy mode: all tenants operate with no generation
        Ok(Generation::none())
    }
}

async fn tenant_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let request_data: TenantCreateRequest = json_request(&mut request).await?;
    let target_tenant_id = request_data.new_tenant_id;
    check_permission(&request, None)?;

    let _timer = STORAGE_TIME_GLOBAL
        .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()])
        .expect("bug")
        .start_timer();

    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

    let state = get_state(&request);

    let generation = get_request_generation(state, request_data.generation)?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let location_conf =
        LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);

    let new_tenant = state
        .tenant_manager
        .upsert_location(
            target_tenant_id,
            location_conf,
            None,
            SpawnMode::Create,
            &ctx,
        )
        .await?;

    let Some(new_tenant) = new_tenant else {
        // This should never happen: indicates a bug in upsert_location
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "Upsert succeeded but didn't return tenant!"
        )));
    };
    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
    new_tenant
        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
        .await?;

    json_response(
        StatusCode::CREATED,
        TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
    )
}

async fn get_tenant_config_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1437,7 +1309,7 @@ async fn update_tenant_config_handler(

crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
    .await
    .map_err(ApiError::InternalServerError)?;
    .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
tenant.set_new_tenant_config(new_tenant_conf);

json_response(StatusCode::OK, ())
@@ -1668,15 +1540,13 @@ async fn handle_tenant_break(

// Obtains an lsn lease on the given timeline.
async fn lsn_lease_handler(
    request: Request<Body>,
    mut request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let lsn: Lsn = parse_query_param(&request, "lsn")?
        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
    let lsn = json_request::<LsnLeaseRequest>(&mut request).await?.lsn;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

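Editor's note: the lease LSN now arrives in a JSON request body rather than a query parameter. A sketch of the body shape the handler reads; this local struct is a hypothetical stand-in for pageserver_api::models::LsnLeaseRequest (the real one uses the Lsn type, not String):

#[derive(serde::Deserialize)]
struct LsnLeaseBody {
    lsn: String, // hex LSN, e.g. "0/169AD58"
}

// e.g. parse_lease_body(r#"{"lsn": "0/169AD58"}"#)
fn parse_lease_body(body: &str) -> Result<LsnLeaseBody, serde_json::Error> {
    serde_json::from_str(body)
}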
@@ -1727,6 +1597,14 @@ async fn timeline_compact_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
    flags |= CompactFlags::ForceImageLayerCreation;
}
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
    if !cfg!(feature = "testing") {
        return Err(ApiError::InternalServerError(anyhow!(
            "enhanced_gc_bottom_most_compaction is only available in testing mode"
        )));
    }
    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
}
let wait_until_uploaded =
    parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

@@ -1843,7 +1721,9 @@ async fn timeline_detach_ancestor_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    use crate::tenant::timeline::detach_ancestor::Options;
    use crate::tenant::timeline::detach_ancestor;
    use pageserver_api::models::detach_ancestor::AncestorDetached;

    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1851,7 +1731,7 @@ async fn timeline_detach_ancestor_handler(
    let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);

    async move {
        let mut options = Options::default();
        let mut options = detach_ancestor::Options::default();

        let rewrite_concurrency =
            parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1879,27 +1759,36 @@ async fn timeline_detach_ancestor_handler(

        let timeline = tenant.get_timeline(timeline_id, true)?;

        let (_guard, prepared) = timeline
        let progress = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
            .await?;

        let res = state
            .tenant_manager
            .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
            .await;
        // uncomment to allow early as possible Tenant::drop
        // drop(tenant);

        match res {
            Ok(reparented_timelines) => {
                let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
        let resp = match progress {
            detach_ancestor::Progress::Prepared(_guard, prepared) => {
                // it would be great to tag the guard on to the tenant activation future
                let reparented_timelines = state
                    .tenant_manager
                    .complete_detaching_timeline_ancestor(
                        tenant_shard_id,
                        timeline_id,
                        prepared,
                        ctx,
                    )
                    .await
                    .context("timeline detach ancestor completion")
                    .map_err(ApiError::InternalServerError)?;

                AncestorDetached {
                    reparented_timelines,
                };

                json_response(StatusCode::OK, resp)
            }
            }
            Err(e) => Err(ApiError::InternalServerError(
                e.context("timeline detach completion"),
            )),
        }
            detach_ancestor::Progress::Done(resp) => resp,
        };

        json_response(StatusCode::OK, resp)
    }
    .instrument(span)
    .await
@@ -2529,6 +2418,189 @@ async fn post_top_tenants(
    )
}

async fn put_tenant_timeline_import_basebackup(
    request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;

    check_permission(&request, Some(tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
    async move {
        let state = get_state(&request);
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;

        let broker_client = state.broker_client.clone();

        let mut body = StreamReader::new(request.into_body().map(|res| {
            res.map_err(|error| {
                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
            })
        }));

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .map_err(ApiError::InternalServerError)
            .await?;

        // TODO mark timeline as not ready until it reaches end_lsn.
        // We might have some wal to import as well, and we should prevent compute
        // from connecting before that and writing conflicting wal.
        //
        // This is not relevant for pageserver->pageserver migrations, since there's
        // no wal to import. But should be fixed if we want to import from postgres.

        // TODO leave clean state on error. For now you can use detach to clean
        // up broken state from a failed import.

        // Import basebackup provided via CopyData
        info!("importing basebackup");

        timeline
            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
            .await
            .map_err(ApiError::InternalServerError)?;

        // Read the end of the tar archive.
        read_tar_eof(body)
            .await
            .map_err(ApiError::InternalServerError)?;

        // TODO check checksum
        // Meanwhile you can verify client-side by taking fullbackup
        // and checking that it matches in size with what was imported.
        // It wouldn't work if base came from vanilla postgres though,
        // since we discard some log files.

        info!("done");
        json_response(StatusCode::OK, ())
    }
    .instrument(span)
    .await
}

async fn put_tenant_timeline_import_wal(
    request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;

    check_permission(&request, Some(tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
    async move {
        let state = get_state(&request);

        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;

        let mut body = StreamReader::new(request.into_body().map(|res| {
            res.map_err(|error| {
                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
            })
        }));

        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
        }

        // TODO leave clean state on error. For now you can use detach to clean
        // up broken state from a failed import.

        // Import wal provided via CopyData
        info!("importing wal");
        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
        info!("wal import complete");

        // Read the end of the tar archive.
        read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;

        // TODO Does it make sense to overshoot?
        if timeline.get_last_record_lsn() < end_lsn {
            return Err(ApiError::InternalServerError(anyhow::anyhow!("WAL import only reached {}, expected to reach end LSN {end_lsn}", timeline.get_last_record_lsn())));
        }

        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
        timeline.freeze_and_flush().await.map_err(|e| match e {
            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
        })?;

        info!("done");

        json_response(StatusCode::OK, ())
    }.instrument(span).await
}

/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// The 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
    use tokio::io::AsyncReadExt;
    let mut buf = [0u8; 512];

    // Read the all-zeros block, and verify it
    let mut total_bytes = 0;
    while total_bytes < 512 {
        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
        total_bytes += nbytes;
        if nbytes == 0 {
            break;
        }
    }
    if total_bytes < 512 {
        anyhow::bail!("incomplete or invalid tar EOF marker");
    }
    if !buf.iter().all(|&x| x == 0) {
        anyhow::bail!("invalid tar EOF marker");
    }

    // Drain any extra zero-blocks after the EOF marker
    let mut trailing_bytes = 0;
    let mut seen_nonzero_bytes = false;
    loop {
        let nbytes = reader.read(&mut buf).await?;
        trailing_bytes += nbytes;
        if !buf.iter().all(|&x| x == 0) {
            seen_nonzero_bytes = true;
        }
        if nbytes == 0 {
            break;
        }
    }
    if seen_nonzero_bytes {
        anyhow::bail!("unexpected non-zero bytes after the tar archive");
    }
    if trailing_bytes % 512 != 0 {
        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
    }
    Ok(())
}

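Editor's note: a minimal sketch of exercising read_tar_eof in a test, assuming only a tokio dev-dependency. A &[u8] slice implements AsyncRead + Unpin, and a single 512-byte zero block stands in for the second EOF block that tokio_tar leaves unread:

#[cfg(test)]
mod read_tar_eof_examples {
    #[tokio::test]
    async fn accepts_remaining_zero_block() {
        // tokio_tar consumes the first zero block; hand the function the rest.
        let remaining = vec![0u8; 512];
        super::read_tar_eof(&remaining[..]).await.unwrap();
    }

    #[tokio::test]
    async fn rejects_truncated_marker() {
        // Fewer than 512 bytes cannot form a valid EOF marker.
        let truncated = vec![0u8; 100];
        assert!(super::read_tar_eof(&truncated[..]).await.is_err());
    }
}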
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -2673,7 +2745,6 @@ pub fn make_router(
    api_handler(r, reload_auth_validation_keys_handler)
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_shard_id", |r| {
    api_handler(r, tenant_status)
})
@@ -2711,12 +2782,6 @@ pub fn make_router(
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
    api_handler(r, timeline_create_handler)
})
.post("/v1/tenant/:tenant_id/attach", |r| {
    api_handler(r, tenant_attach_handler)
})
.post("/v1/tenant/:tenant_id/detach", |r| {
    api_handler(r, tenant_detach_handler)
})
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
    api_handler(r, tenant_reset_handler)
})
@@ -2830,5 +2895,13 @@ pub fn make_router(
    "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
    |r| testing_api_handler("perf_info", r, perf_info),
)
.put(
    "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
    |r| api_handler(r, put_tenant_timeline_import_basebackup),
)
.put(
    "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
    |r| api_handler(r, put_tenant_timeline_import_wal),
)
.any(handler_404))
}

pageserver/src/l0_flush.rs (new file, 46 lines)
@@ -0,0 +1,46 @@
use std::{num::NonZeroUsize, sync::Arc};

use crate::tenant::ephemeral_file;

#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
    #[default]
    PageCached,
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
}

#[derive(Clone)]
pub struct L0FlushGlobalState(Arc<Inner>);

pub(crate) enum Inner {
    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
}

impl L0FlushGlobalState {
    pub fn new(config: L0FlushConfig) -> Self {
        match config {
            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
            L0FlushConfig::Direct { max_concurrency } => {
                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
                Self(Arc::new(Inner::Direct { semaphore }))
            }
        }
    }

    pub(crate) fn inner(&self) -> &Arc<Inner> {
        &self.0
    }
}

impl L0FlushConfig {
    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
        use L0FlushConfig::*;
        match self {
            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
        }
    }
}
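Editor's note: the serde attributes above give an internally tagged, kebab-case representation. A sketch of what that accepts, using serde_json purely for illustration (the pageserver actually feeds this through its TOML deserializer):

#[cfg(test)]
mod l0_flush_config_examples {
    use super::L0FlushConfig;
    use std::num::NonZeroUsize;

    #[test]
    fn parses_tagged_kebab_case() {
        // "mode" is the serde tag; variant names are kebab-cased.
        let direct: L0FlushConfig =
            serde_json::from_str(r#"{"mode": "direct", "max_concurrency": 4}"#).unwrap();
        assert_eq!(
            direct,
            L0FlushConfig::Direct {
                max_concurrency: NonZeroUsize::new(4).unwrap()
            }
        );
        let cached: L0FlushConfig = serde_json::from_str(r#"{"mode": "page-cached"}"#).unwrap();
        assert_eq!(cached, L0FlushConfig::PageCached);
    }
}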
@@ -11,6 +11,7 @@ pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod l0_flush;
pub use pageserver_api::keyspace;
pub mod aux_file;
pub mod metrics;
@@ -22,7 +23,6 @@ pub mod span;
pub(crate) mod statvfs;
pub mod task_mgr;
pub mod tenant;
pub mod trace;
pub mod utilization;
pub mod virtual_file;
pub mod walingest;
@@ -113,11 +113,7 @@ pub async fn shutdown_pageserver(
}

/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub(crate) const TENANT_CONFIG_NAME: &str = "config";

/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
/// Full path: `tenants/<tenant_id>/config-v1`.
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";

/// Per-tenant copy of their remote heatmap, downloaded into the local

@@ -8,7 +8,7 @@ use metrics::{
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum::{EnumCount, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use tracing::warn;
use utils::id::TimelineId;
@@ -53,9 +53,6 @@ pub(crate) enum StorageTimeOperation {

    #[strum(serialize = "find gc cutoffs")]
    FindGcCutoffs,

    #[strum(serialize = "create tenant")]
    CreateTenant,
}

pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
@@ -467,6 +464,49 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
});

static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_pitr_history_size",
        "Data written since PITR cutoff on this timeline",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum MetricLayerKind {
    Delta,
    Image,
}

static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_layer_bytes",
        "Sum of layer physical sizes in bytes",
        &["tenant_id", "shard_id", "timeline_id", "kind"]
    )
    .expect("failed to define a metric")
});

static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_layer_count",
        "Number of layers that exist",
        &["tenant_id", "shard_id", "timeline_id", "kind"]
    )
    .expect("failed to define a metric")
});

static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_archive_size",
        "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_standby_horizon",
@@ -479,7 +519,7 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_resident_physical_size",
        "The size of the layer files present in the pageserver's filesystem.",
        "The size of the layer files present in the pageserver's filesystem, for attached locations.",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
@@ -545,6 +585,31 @@ static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
});

static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_valid_lsn_lease_count",
        "The number of valid leases after refreshing gc info.",
        &["tenant_id", "shard_id", "timeline_id"],
    )
    .expect("failed to define a metric")
});

pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_circuit_breaker_broken",
        "How many times a circuit breaker has broken"
    )
    .expect("failed to define a metric")
});

pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_circuit_breaker_unbroken",
        "How many times a circuit breaker has been un-broken (recovered)"
    )
    .expect("failed to define a metric")
});

pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -1070,21 +1135,12 @@ pub(crate) mod virtual_file_io_engine {
    });
}

#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
    global: Histogram,
    per_tenant_timeline: Histogram,
}

impl GlobalAndPerTimelineHistogram {
    fn observe(&self, value: f64) {
        self.global.observe(value);
        self.per_tenant_timeline.observe(value);
    }
}

struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    h: &'a GlobalAndPerTimelineHistogram,
    global_metric: &'a Histogram,

    // Optional because not all op types are tracked per-timeline
    timeline_metric: Option<&'a Histogram>,

    ctx: &'c RequestContext,
    start: std::time::Instant,
    op: SmgrQueryType,
@@ -1115,7 +1171,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
        elapsed
    }
};
self.h.observe(ex_throttled.as_secs_f64());
self.global_metric.observe(ex_throttled.as_secs_f64());
if let Some(timeline_metric) = self.timeline_metric {
    timeline_metric.observe(ex_throttled.as_secs_f64());
}
}
}

@@ -1140,7 +1199,8 @@ pub enum SmgrQueryType {

#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
    global_metrics: [Histogram; SmgrQueryType::COUNT],
    per_timeline_getpage: Histogram,
}

static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
@@ -1218,27 +1278,32 @@ impl SmgrQueryTimePerTimeline {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let metrics = std::array::from_fn(|i| {
let global_metrics = std::array::from_fn(|i| {
    let op = SmgrQueryType::from_repr(i).unwrap();
    let global = SMGR_QUERY_TIME_GLOBAL
    SMGR_QUERY_TIME_GLOBAL
        .get_metric_with_label_values(&[op.into()])
        .unwrap();
    let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
        .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
        .unwrap();
    GlobalAndPerTimelineHistogram {
        global,
        per_tenant_timeline,
    }
        .unwrap()
});
Self { metrics }

let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
    .get_metric_with_label_values(&[
        SmgrQueryType::GetPageAtLsn.into(),
        &tenant_id,
        &shard_slug,
        &timeline_id,
    ])
    .unwrap();
Self {
    global_metrics,
    per_timeline_getpage,
}
}
pub(crate) fn start_timer<'c: 'a, 'a>(
    &'a self,
    op: SmgrQueryType,
    ctx: &'c RequestContext,
) -> impl Drop + '_ {
    let metric = &self.metrics[op as usize];
) -> Option<impl Drop + '_> {
    let global_metric = &self.global_metrics[op as usize];
    let start = Instant::now();
    match ctx.micros_spent_throttled.open() {
        Ok(()) => (),
@@ -1257,12 +1322,20 @@ impl SmgrQueryTimePerTimeline {
        });
    }
}
GlobalAndPerTimelineHistogramTimer {
    h: metric,

let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
    Some(&self.per_timeline_getpage)
} else {
    None
};

Some(GlobalAndPerTimelineHistogramTimer {
    global_metric,
    timeline_metric,
    ctx,
    start,
    op,
}
})
}
}

@@ -1309,17 +1382,9 @@ mod smgr_query_time_tests {
let get_counts = || {
    let global: u64 = ops
        .iter()
        .map(|op| metrics.metrics[*op as usize].global.get_sample_count())
        .map(|op| metrics.global_metrics[*op as usize].get_sample_count())
        .sum();
    let per_tenant_timeline: u64 = ops
        .iter()
        .map(|op| {
            metrics.metrics[*op as usize]
                .per_tenant_timeline
                .get_sample_count()
        })
        .sum();
    (global, per_tenant_timeline)
    (global, metrics.per_timeline_getpage.get_sample_count())
};

let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1330,7 +1395,12 @@ mod smgr_query_time_tests {
drop(timer);

let (post_global, post_per_tenant_timeline) = get_counts();
assert_eq!(post_per_tenant_timeline, 1);
if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
    // getpage ops are tracked per-timeline, others aren't
    assert_eq!(post_per_tenant_timeline, 1);
} else {
    assert_eq!(post_per_tenant_timeline, 0);
}
assert!(post_global > pre_global);
}
}
@@ -1427,15 +1497,53 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
    }
}

pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
        "Number of live network connections",
pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
    register_int_counter_pair_vec!(
        "pageserver_live_connections_started",
        "Number of network connections that we started handling",
        "pageserver_live_connections_finished",
        "Number of network connections that we finished handling",
        &["pageserver_connection_kind"]
    )
    .expect("failed to define a metric")
});

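Editor's note: splitting the old gauge into a started/finished counter pair keeps both series monotonic, and the live count is recovered by subtraction at query time. A self-contained sketch of the usual guard pattern behind such pairs, using std atomics rather than the metrics crate's actual IntCounterPair API (which is not shown in this diff):

use std::sync::atomic::{AtomicU64, Ordering};

// Hypothetical counter pair with a drop-guard: bump "started" when a
// connection begins and "finished" when the guard drops, so
// live = started - finished holds at every instant.
static STARTED: AtomicU64 = AtomicU64::new(0);
static FINISHED: AtomicU64 = AtomicU64::new(0);

struct ConnGuard;

impl ConnGuard {
    fn begin() -> Self {
        STARTED.fetch_add(1, Ordering::Relaxed);
        ConnGuard
    }
}

impl Drop for ConnGuard {
    fn drop(&mut self) {
        FINISHED.fetch_add(1, Ordering::Relaxed);
    }
}

fn live_connections() -> u64 {
    STARTED.load(Ordering::Relaxed) - FINISHED.load(Ordering::Relaxed)
}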
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
pub(crate) enum ComputeCommandKind {
    PageStreamV2,
    PageStream,
    Basebackup,
    Fullbackup,
    LeaseLsn,
}

pub(crate) struct ComputeCommandCounters {
    map: EnumMap<ComputeCommandKind, IntCounter>,
}

pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| {
    let inner = register_int_counter_vec!(
        "pageserver_compute_commands",
        "Number of compute -> pageserver commands processed",
        &["command"]
    )
    .expect("failed to define a metric");

    ComputeCommandCounters {
        map: EnumMap::from_array(std::array::from_fn(|i| {
            let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
            let command_str: &'static str = command.into();
            inner.with_label_values(&[command_str])
        })),
    }
});

impl ComputeCommandCounters {
    pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter {
        &self.map[command]
    }
}

// remote storage metrics

static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
@@ -1645,6 +1753,15 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
    }
});

pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_secondary_resident_physical_size",
        "The size of the layer files present in the pageserver's filesystem, for secondary locations.",
        &["tenant_id", "shard_id"]
    )
    .expect("failed to define a metric")
});

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
    Upload,
@@ -2047,6 +2164,12 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
pub(crate) layer_count_image: UIntGauge,
pub(crate) layer_size_delta: UIntGauge,
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2055,6 +2178,8 @@ pub(crate) struct TimelineMetrics {
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
/// Number of valid LSN leases.
pub valid_lsn_lease_count_gauge: UIntGauge,
shutdown: std::sync::atomic::AtomicBool,
}

@@ -2118,6 +2243,51 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();

let pitr_history_size = PITR_HISTORY_SIZE
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();

let archival_size = TIMELINE_ARCHIVE_SIZE
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();

let layer_size_image = TIMELINE_LAYER_SIZE
    .get_metric_with_label_values(&[
        &tenant_id,
        &shard_id,
        &timeline_id,
        MetricLayerKind::Image.into(),
    ])
    .unwrap();

let layer_count_image = TIMELINE_LAYER_COUNT
    .get_metric_with_label_values(&[
        &tenant_id,
        &shard_id,
        &timeline_id,
        MetricLayerKind::Image.into(),
    ])
    .unwrap();

let layer_size_delta = TIMELINE_LAYER_SIZE
    .get_metric_with_label_values(&[
        &tenant_id,
        &shard_id,
        &timeline_id,
        MetricLayerKind::Delta.into(),
    ])
    .unwrap();

let layer_count_delta = TIMELINE_LAYER_COUNT
    .get_metric_with_label_values(&[
        &tenant_id,
        &shard_id,
        &timeline_id,
        MetricLayerKind::Delta.into(),
    ])
    .unwrap();

let standby_horizon_gauge = STANDBY_HORIZON
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();
@@ -2153,6 +2323,10 @@ impl TimelineMetrics {
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
    .build(&tenant_id, &shard_id, &timeline_id);

let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
    .unwrap();

TimelineMetrics {
    tenant_id,
    shard_id,
@@ -2166,6 +2340,12 @@ impl TimelineMetrics {
    find_gc_cutoffs_histo,
    load_layer_map_histo,
    last_record_gauge,
    pitr_history_size,
    archival_size,
    layer_size_image,
    layer_count_image,
    layer_size_delta,
    layer_count_delta,
    standby_horizon_gauge,
    resident_physical_size_gauge,
    current_logical_size_gauge,
@@ -2175,6 +2355,7 @@ impl TimelineMetrics {
    evictions_with_low_residence_duration: std::sync::RwLock::new(
        evictions_with_low_residence_duration,
    ),
    valid_lsn_lease_count_gauge,
    shutdown: std::sync::atomic::AtomicBool::default(),
}
}
@@ -2222,8 +2403,38 @@ impl TimelineMetrics {
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
    let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}

let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Image.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
]);
|
||||
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
MetricLayerKind::Delta.into(),
|
||||
]);
|
||||
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -2254,14 +2465,12 @@ impl TimelineMetrics {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in SmgrQueryType::iter() {
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
op.into(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
SmgrQueryType::GetPageAtLsn.into(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
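
// Note: the single remove_label_values call above mirrors the registration
// change earlier in this patch — per-tenant-timeline smgr query timing is now
// kept only for GetPageAtLsn, so shutdown unregisters just that one label set
// instead of iterating every SmgrQueryType.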

@@ -2932,4 +3141,5 @@ pub fn preinitialize_metrics() {
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&tenant_throttling::TIMELINE_GET);
    Lazy::force(&BASEBACKUP_QUERY_TIME);
    Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
}

@@ -4,9 +4,7 @@
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use bytes::Bytes;
use futures::stream::FuturesUnordered;
use futures::Stream;
use futures::StreamExt;
use pageserver_api::key::Key;
use pageserver_api::models::TenantState;
@@ -28,7 +26,6 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::net::TcpListener;
use std::pin::pin;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
@@ -37,10 +34,8 @@ use std::time::Instant;
use std::time::SystemTime;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
use utils::{
    auth::{Claims, Scope, SwappableJwtAuth},
@@ -53,9 +48,8 @@ use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
use crate::pgdatadir_mapping::Version;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
@@ -66,13 +60,11 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Tenant;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -82,56 +74,6 @@ use postgres_ffi::BLCKSZ;
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// The 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
    use tokio::io::AsyncReadExt;
    let mut buf = [0u8; 512];

    // Read the all-zeros block, and verify it
    let mut total_bytes = 0;
    while total_bytes < 512 {
        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
        total_bytes += nbytes;
        if nbytes == 0 {
            break;
        }
    }
    if total_bytes < 512 {
        anyhow::bail!("incomplete or invalid tar EOF marker");
    }
    if !buf.iter().all(|&x| x == 0) {
        anyhow::bail!("invalid tar EOF marker");
    }

    // Drain any extra zero-blocks after the EOF marker
    let mut trailing_bytes = 0;
    let mut seen_nonzero_bytes = false;
    loop {
        let nbytes = reader.read(&mut buf).await?;
        trailing_bytes += nbytes;
        if !buf.iter().all(|&x| x == 0) {
            seen_nonzero_bytes = true;
        }
        if nbytes == 0 {
            break;
        }
    }
    if seen_nonzero_bytes {
        anyhow::bail!("unexpected non-zero bytes after the tar archive");
    }
    if trailing_bytes % 512 != 0 {
        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
    }
    Ok(())
}
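
// A minimal sketch (not part of the diff) of exercising the read_tar_eof
// shown above: after tokio_tar has consumed the first zero block, the reader
// is left at the second zero block, optionally followed by whole zero blocks
// of record padding. tokio implements AsyncRead for std::io::Cursor, so an
// in-memory buffer works as the reader here.
#[cfg(test)]
mod read_tar_eof_sketch {
    #[tokio::test]
    async fn accepts_record_size_padding() {
        // the second EOF block plus three extra zero blocks of padding
        let tail = std::io::Cursor::new(vec![0u8; 512 * 4]);
        super::read_tar_eof(tail).await.unwrap();
    }

    #[tokio::test]
    async fn rejects_trailing_garbage() {
        let mut tail = vec![0u8; 512];
        tail.extend_from_slice(b"garbage after the archive");
        assert!(super::read_tar_eof(std::io::Cursor::new(tail)).await.is_err());
    }
}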

///////////////////////////////////////////////////////////////////////////////

///
@@ -141,7 +83,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
///
pub async fn libpq_listener_main(
    tenant_manager: Arc<TenantManager>,
    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
@@ -186,7 +127,6 @@ pub async fn libpq_listener_main(
                false,
                page_service_conn_main(
                    tenant_manager.clone(),
                    broker_client.clone(),
                    local_auth,
                    socket,
                    auth_type,
@@ -209,20 +149,14 @@ pub async fn libpq_listener_main(
#[instrument(skip_all, fields(peer_addr))]
async fn page_service_conn_main(
    tenant_manager: Arc<TenantManager>,
    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
) -> anyhow::Result<()> {
    // Immediately increment the gauge, then create a job to decrement it on task exit.
    // One of the pros of `defer!` is that this will *most probably*
    // get called, even in presence of panics.
    let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
    gauge.inc();
    scopeguard::defer! {
        gauge.dec();
    }
    let _guard = LIVE_CONNECTIONS
        .with_label_values(&["page_service"])
        .guard();

    socket
        .set_nodelay(true)
@@ -267,12 +201,11 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler =
        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
    let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
        .run(&mut conn_handler, task_mgr::shutdown_watcher)
        .run(&mut conn_handler, &task_mgr::shutdown_token())
        .await
    {
        Ok(()) => {
@@ -299,7 +232,6 @@ struct HandlerTimeline {
}

struct PageServerHandler {
    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -391,13 +323,11 @@ impl From<WaitLsnError> for QueryError {
impl PageServerHandler {
    pub fn new(
        tenant_manager: Arc<TenantManager>,
        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
            tenant_manager,
            broker_client,
            auth,
            claims: None,
            connection_ctx,
@@ -480,73 +410,6 @@ impl PageServerHandler {
        )
    }

    fn copyin_stream<'a, IO>(
        &'a self,
        pgb: &'a mut PostgresBackend<IO>,
        cancel: &'a CancellationToken,
    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        async_stream::try_stream! {
            loop {
                let msg = tokio::select! {
                    biased;

                    _ = cancel.cancelled() => {
                        // We were requested to shut down.
                        let msg = "pageserver is shutting down";
                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
                        Err(QueryError::Shutdown)
                    }

                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
                };

                match msg {
                    Ok(Some(message)) => {
                        let copy_data_bytes = match message {
                            FeMessage::CopyData(bytes) => bytes,
                            FeMessage::CopyDone => { break },
                            FeMessage::Sync => continue,
                            FeMessage::Terminate => {
                                let msg = "client terminated connection with Terminate message during COPY";
                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
                                // error can't happen here, ErrorResponse serialization should be always ok
                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                                break;
                            }
                            m => {
                                let msg = format!("unexpected message {m:?}");
                                // error can't happen here, ErrorResponse serialization should be always ok
                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
                                break;
                            }
                        };

                        yield copy_data_bytes;
                    }
                    Ok(None) => {
                        let msg = "client closed connection during COPY";
                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
                        // error can't happen here, ErrorResponse serialization should be always ok
                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                    }
                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
                        Err(io_error)?;
                    }
                    Err(other) => {
                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
                    }
                };
            }
        }
    }

    #[instrument(skip_all)]
    async fn handle_pagerequests<IO>(
        &mut self,
@@ -565,18 +428,6 @@ impl PageServerHandler {
            .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
            .await?;

        // Make request tracer if needed
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path =
                tenant
                    .conf
                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
            Some(Tracer::new(path))
        } else {
            None
        };

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -608,11 +459,6 @@ impl PageServerHandler {
            trace!("query: {copy_data_bytes:?}");
            fail::fail_point!("ps::handle-pagerequest-message");

            // Trace request if needed
            if let Some(t) = tracer.as_mut() {
                t.trace(&copy_data_bytes)
            }

            let neon_fe_msg =
                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

@@ -718,128 +564,6 @@ impl PageServerHandler {
        Ok(())
    }

    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
    async fn handle_import_basebackup<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        base_lsn: Lsn,
        _end_lsn: Lsn,
        pg_version: u32,
        ctx: RequestContext,
    ) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

        // Create empty timeline
        info!("creating new timeline");
        let tenant = self
            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
            .await?;
        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .await?;

        // TODO mark timeline as not ready until it reaches end_lsn.
        // We might have some wal to import as well, and we should prevent compute
        // from connecting before that and writing conflicting wal.
        //
        // This is not relevant for pageserver->pageserver migrations, since there's
        // no wal to import. But should be fixed if we want to import from postgres.

        // TODO leave clean state on error. For now you can use detach to clean
        // up broken state from a failed import.

        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
        self.flush_cancellable(pgb, &tenant.cancel).await?;

        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
        timeline
            .import_basebackup_from_tar(
                tenant.clone(),
                &mut copyin_reader,
                base_lsn,
                self.broker_client.clone(),
                &ctx,
            )
            .await?;

        // Read the end of the tar archive.
        read_tar_eof(copyin_reader).await?;

        // TODO check checksum
        // Meanwhile you can verify client-side by taking fullbackup
        // and checking that it matches in size with what was imported.
        // It wouldn't work if base came from vanilla postgres though,
        // since we discard some log files.

        info!("done");
        Ok(())
    }

    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
    async fn handle_import_wal<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        start_lsn: Lsn,
        end_lsn: Lsn,
        ctx: RequestContext,
    ) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        let timeline = self
            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(QueryError::Other(
                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
            );
        }

        // TODO leave clean state on error. For now you can use detach to clean
        // up broken state from a failed import.

        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;
        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

        // Read the end of the tar archive.
        read_tar_eof(copyin_reader).await?;

        // TODO Does it make sense to overshoot?
        if timeline.get_last_record_lsn() < end_lsn {
            return Err(QueryError::Other(
                anyhow::anyhow!("Cannot import WAL up to Lsn {end_lsn}: import stopped at {}", timeline.get_last_record_lsn()))
            );
        }

        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
        timeline.freeze_and_flush().await.map_err(|e| match e {
            FlushLayerError::Cancelled => QueryError::Shutdown,
            other => QueryError::Other(other.into()),
        })?;

        info!("done");
        Ok(())
    }

    /// Helper function to handle the LSN from client request.
    ///
    /// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1554,6 +1278,10 @@ where

        self.check_permission(Some(tenant_id))?;

        COMPUTE_COMMANDS_COUNTERS
            .for_command(ComputeCommandKind::PageStreamV2)
            .inc();

        self.handle_pagerequests(
            pgb,
            tenant_id,
@@ -1579,6 +1307,10 @@ where

        self.check_permission(Some(tenant_id))?;

        COMPUTE_COMMANDS_COUNTERS
            .for_command(ComputeCommandKind::PageStream)
            .inc();

        self.handle_pagerequests(
            pgb,
            tenant_id,
@@ -1605,6 +1337,10 @@ where

        self.check_permission(Some(tenant_id))?;

        COMPUTE_COMMANDS_COUNTERS
            .for_command(ComputeCommandKind::Basebackup)
            .inc();

        let lsn = if let Some(lsn_str) = params.get(2) {
            Some(
                Lsn::from_str(lsn_str)
@@ -1644,48 +1380,6 @@ where
            metric_recording.observe(&res);
            res?;
        }
        // return pair of prev_lsn and last_lsn
        else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
            if params.len() != 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for get_last_record_rlsn command"
                )));
            }

            let tenant_id = TenantId::from_str(params[0])
                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;

            tracing::Span::current()
                .record("tenant_id", field::display(tenant_id))
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
            async {
                let timeline = self
                    .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;

                let end_of_timeline = timeline.get_last_record_rlsn();

                pgb.write_message_noflush(&BeMessage::RowDescription(&[
                    RowDescriptor::text_col(b"prev_lsn"),
                    RowDescriptor::text_col(b"last_lsn"),
                ]))?
                .write_message_noflush(&BeMessage::DataRow(&[
                    Some(end_of_timeline.prev.to_string().as_bytes()),
                    Some(end_of_timeline.last.to_string().as_bytes()),
                ]))?
                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
                anyhow::Ok(())
            }
            .instrument(info_span!(
                "handle_get_last_record_lsn",
                shard_id = tracing::field::Empty
            ))
            .await?;
        }
        // same as basebackup, but result includes relational data as well
        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
            if params.len() < 2 {
@@ -1723,6 +1417,10 @@ where

            self.check_permission(Some(tenant_id))?;

            COMPUTE_COMMANDS_COUNTERS
                .for_command(ComputeCommandKind::Fullbackup)
                .inc();

            // Check that the timeline exists
            self.handle_basebackup_request(
                pgb,
@@ -1736,101 +1434,6 @@ where
            )
            .await?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("import basebackup ") {
            // Import the `base` section (everything but the wal) of a basebackup.
            // Assumes the tenant already exists on this pageserver.
            //
            // Files are scheduled to be persisted to remote storage, and the
            // caller should poll the http api to check when that is done.
            //
            // Example import command:
            // 1. Get start/end LSN from backup_manifest file
            // 2. Run:
            // cat my_backup/base.tar | psql -h $PAGESERVER \
            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
            let params = &parts[2..];
            if params.len() != 5 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for import basebackup command"
                )));
            }
            let tenant_id = TenantId::from_str(params[0])
                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
            let base_lsn = Lsn::from_str(params[2])
                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
            let end_lsn = Lsn::from_str(params[3])
                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
            let pg_version = u32::from_str(params[4])
                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;

            tracing::Span::current()
                .record("tenant_id", field::display(tenant_id))
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;

            match self
                .handle_import_basebackup(
                    pgb,
                    tenant_id,
                    timeline_id,
                    base_lsn,
                    end_lsn,
                    pg_version,
                    ctx,
                )
                .await
            {
                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
                Err(e) => {
                    error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
                        &e.to_string(),
                        Some(e.pg_error_code()),
                    ))?
                }
            };
        } else if query_string.starts_with("import wal ") {
            // Import the `pg_wal` section of a basebackup.
            //
            // Files are scheduled to be persisted to remote storage, and the
            // caller should poll the http api to check when that is done.
            let params = &parts[2..];
            if params.len() != 4 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for import wal command"
                )));
            }
            let tenant_id = TenantId::from_str(params[0])
                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
            let timeline_id = TimelineId::from_str(params[1])
                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
            let start_lsn = Lsn::from_str(params[2])
                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
            let end_lsn = Lsn::from_str(params[3])
                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;

            tracing::Span::current()
                .record("tenant_id", field::display(tenant_id))
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;

            match self
                .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
                .await
            {
                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
                Err(e) => {
                    error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
                        &e.to_string(),
                        Some(e.pg_error_code()),
                    ))?
                }
            };
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
@@ -1855,6 +1458,10 @@ where

            self.check_permission(Some(tenant_shard_id.tenant_id))?;

            COMPUTE_COMMANDS_COUNTERS
                .for_command(ComputeCommandKind::LeaseLsn)
                .inc();

            // The caller is responsible for providing correct lsn.
            let lsn = Lsn::from_str(params[2])
                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
@@ -1872,62 +1479,6 @@ where
                ))?
            }
        };
        } else if let Some(params) = parts.strip_prefix(&["show"]) {
            // show <tenant_id>
            if params.len() != 1 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for config command"
                )));
            }
            let tenant_id = TenantId::from_str(params[0])
                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;

            tracing::Span::current().record("tenant_id", field::display(tenant_id));

            self.check_permission(Some(tenant_id))?;

            let tenant = self
                .get_active_tenant_with_timeout(
                    tenant_id,
                    ShardSelector::Zero,
                    ACTIVE_TENANT_TIMEOUT,
                )
                .await?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
                RowDescriptor::int8_col(b"compaction_target_size"),
                RowDescriptor::int8_col(b"compaction_period"),
                RowDescriptor::int8_col(b"compaction_threshold"),
                RowDescriptor::int8_col(b"gc_horizon"),
                RowDescriptor::int8_col(b"gc_period"),
                RowDescriptor::int8_col(b"image_creation_threshold"),
                RowDescriptor::int8_col(b"pitr_interval"),
            ]))?
            .write_message_noflush(&BeMessage::DataRow(&[
                Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
                Some(
                    tenant
                        .get_checkpoint_timeout()
                        .as_secs()
                        .to_string()
                        .as_bytes(),
                ),
                Some(tenant.get_compaction_target_size().to_string().as_bytes()),
                Some(
                    tenant
                        .get_compaction_period()
                        .as_secs()
                        .to_string()
                        .as_bytes(),
                ),
                Some(tenant.get_compaction_threshold().to_string().as_bytes()),
                Some(tenant.get_gc_horizon().to_string().as_bytes()),
                Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
                Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
                Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
            ]))?
            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else {
            return Err(QueryError::Other(anyhow::anyhow!(
                "unknown command {query_string}"

@@ -522,7 +522,7 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Option<TimestampTz>, PageReconstructError> {
        let mut max: Option<TimestampTz> = None;
        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
        self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
            if let Some(max_prev) = max {
                max = Some(max_prev.max(timestamp));
            } else {
@@ -854,13 +854,14 @@ impl Timeline {
        result.add_key(DBDIR_KEY);

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
        let dbdir = DbDirectory::des(&buf)?;
        let dbdir = self.list_dbdirs(lsn, ctx).await?;
        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();

        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
        dbs.sort_unstable();
        for (spcnode, dbnode) in dbs {
            result.add_key(relmap_file_key(spcnode, dbnode));
        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
        for ((spcnode, dbnode), has_relmap_file) in dbs {
            if has_relmap_file {
                result.add_key(relmap_file_key(spcnode, dbnode));
            }
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
@@ -919,6 +920,9 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
        // and the keys will not be garbage-collected.
        #[cfg(test)]
        {
            let guard = self.extra_test_dense_keyspace.load();
@@ -927,13 +931,48 @@ impl Timeline {
            }
        }

        Ok((
            result.to_keyspace(),
            /* AUX sparse key space */
            SparseKeySpace(KeySpace {
                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
            }),
        ))
        let dense_keyspace = result.to_keyspace();
        let sparse_keyspace = SparseKeySpace(KeySpace {
            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
        });

        if cfg!(debug_assertions) {
            // Verify if the sparse keyspaces are ordered and non-overlapping.

            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
            // category of sparse keys are split into their own image/delta files. If there
            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
            // and we want the developer to keep the keyspaces separated.

            let ranges = &sparse_keyspace.0.ranges;

            // TODO: use a single overlaps_with across the codebase
            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
                !(a.end <= b.start || b.end <= a.start)
            }
            for i in 0..ranges.len() {
                for j in 0..i {
                    if overlaps_with(&ranges[i], &ranges[j]) {
                        panic!(
                            "overlapping sparse keyspace: {}..{} and {}..{}",
                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
                        );
                    }
                }
            }
            for i in 1..ranges.len() {
                assert!(
                    ranges[i - 1].end <= ranges[i].start,
                    "unordered sparse keyspace: {}..{} and {}..{}",
                    ranges[i - 1].start,
                    ranges[i - 1].end,
                    ranges[i].start,
                    ranges[i].end
                );
            }
        }

        Ok((dense_keyspace, sparse_keyspace))
    }
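
    // A small sanity sketch (not part of the diff) of the overlaps_with
    // semantics used above, assuming half-open `Range` values: ranges that
    // merely touch do not overlap, and the ordering check compares each
    // range's end to the next range's start.
    //
    //     assert!(!overlaps_with(&(0..10), &(10..20))); // touching: no overlap
    //     assert!(overlaps_with(&(0..11), &(10..20)));  // one key shared: overlap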

    /// Get cached size of relation if it was not updated after the specified LSN

@@ -39,6 +39,7 @@ use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::backoff;
use utils::circuit_breaker::CircuitBreaker;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::failpoint_support;
@@ -73,9 +74,11 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::TENANT;
use crate::metrics::{
    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
    remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -88,6 +91,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::walredo;
use crate::InitializationOrder;
use std::collections::hash_map::Entry;
use std::collections::BTreeSet;
@@ -165,6 +169,7 @@ pub struct TenantSharedResources {
    pub broker_client: storage_broker::BrokerClientChannel,
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
    pub l0_flush_global_state: L0FlushGlobalState,
}

/// A [`Tenant`] is really an _attached_ tenant. The configuration
@@ -212,8 +217,6 @@ pub(crate) enum SpawnMode {
    Eager,
    /// Lazy activation in the background, with the option to skip the queue if the need comes up
    Lazy,
    /// Tenant has been created during the lifetime of this process
    Create,
}

///
@@ -275,6 +278,10 @@ pub struct Tenant {

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

    /// Track repeated failures to compact, so that we can back off.
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
    /// background warmup.
@@ -295,6 +302,8 @@ pub struct Tenant {

    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,

    l0_flush_global_state: L0FlushGlobalState,
}

impl std::fmt::Debug for Tenant {
@@ -323,6 +332,16 @@ impl From<harness::TestRedoManager> for WalRedoManager {
}

impl WalRedoManager {
    pub(crate) async fn shutdown(&self) {
        match self {
            Self::Prod(mgr) => mgr.shutdown().await,
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
            }
        }
    }

    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
@@ -343,7 +362,7 @@ impl WalRedoManager {
        base_img: Option<(Lsn, bytes::Bytes)>,
        records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
        pg_version: u32,
    ) -> anyhow::Result<bytes::Bytes> {
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
            Self::Prod(mgr) => {
                mgr.request_redo(key, lsn, base_img, records, pg_version)
@@ -520,6 +539,15 @@ impl From<PageReconstructError> for GcError {
    }
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError {
    #[error("TOML deserialization error: '{0}'")]
    DeserializeToml(#[from] toml_edit::de::Error),

    #[error("Config not found at {0}")]
    NotFound(Utf8PathBuf),
}

impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
@@ -648,7 +676,7 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
    ) -> Arc<Tenant> {
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
            tenant_shard_id,
@@ -658,6 +686,7 @@ impl Tenant {
            broker_client,
            remote_storage,
            deletion_queue_client,
            l0_flush_global_state,
        } = resources;

        let attach_mode = attached_conf.location.attach_mode;
@@ -672,6 +701,7 @@ impl Tenant {
            tenant_shard_id,
            remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
        ));

        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -797,9 +827,6 @@ impl Tenant {
                };

                let preload = match &mode {
                    SpawnMode::Create => {
                        None
                    },
                    SpawnMode::Eager | SpawnMode::Lazy => {
                        let _preload_timer = TENANT.preload.start_timer();
                        let res = tenant_clone
@@ -821,11 +848,8 @@ impl Tenant {

                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
                let attached = {
                    let _attach_timer = match mode {
                        SpawnMode::Create => None,
                        SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
                    };
                    tenant_clone.attach(preload, mode, &ctx).await
                    let _attach_timer = Some(TENANT.attach.start_timer());
                    tenant_clone.attach(preload, &ctx).await
                };

                match attached {
@@ -856,7 +880,7 @@ impl Tenant {
            }
            .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
        );
        Ok(tenant)
        tenant
    }

    #[instrument(skip_all)]
@@ -901,21 +925,14 @@ impl Tenant {
    async fn attach(
        self: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        failpoint_support::sleep_millis_async!("before-attaching-tenant");

        let preload = match (preload, mode) {
            (Some(p), _) => p,
            (None, SpawnMode::Create) => TenantPreload {
                timelines: HashMap::new(),
            },
            (None, _) => {
                anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
            }
        let Some(preload) = preload else {
            anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
        };

        let mut timelines_to_resume_deletions = vec![];
@@ -984,6 +1001,7 @@ impl Tenant {
                        TimelineResources {
                            remote_client,
                            timeline_get_throttle: self.timeline_get_throttle.clone(),
                            l0_flush_global_state: self.l0_flush_global_state.clone(),
                        },
                        ctx,
                    )
@@ -1147,30 +1165,6 @@ impl Tenant {
            .await
    }

    /// Create a placeholder Tenant object for a broken tenant
    pub fn create_broken_tenant(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        remote_storage: GenericRemoteStorage,
        reason: String,
    ) -> Arc<Tenant> {
        Arc::new(Tenant::new(
            TenantState::Broken {
                reason,
                backtrace: String::new(),
            },
            conf,
            AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
            // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
            // to occupy the slot for this TenantShardId.
            ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
            None,
            tenant_shard_id,
            remote_storage,
            DeletionQueueClient::broken(),
        ))
    }

    async fn load_timeline_metadata(
        self: &Arc<Tenant>,
        timeline_ids: HashSet<TimelineId>,
@@ -1377,7 +1371,7 @@ impl Tenant {
        initdb_lsn: Lsn,
        pg_version: u32,
        ctx: &RequestContext,
        delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -1653,13 +1647,31 @@ impl Tenant {
            timelines_to_compact
        };

        // Before doing any I/O work, check our circuit breaker
        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
            info!("Skipping compaction due to previous failures");
            return Ok(());
        }

        for (timeline_id, timeline) in &timelines_to_compact {
            timeline
                .compact(cancel, EnumSet::empty(), ctx)
                .instrument(info_span!("compact_timeline", %timeline_id))
                .await?;
                .await
                .map_err(|e| {
                    self.compaction_circuit_breaker
                        .lock()
                        .unwrap()
                        .fail(&CIRCUIT_BREAKERS_BROKEN, &e);
                    e
                })?;
        }

        self.compaction_circuit_breaker
            .lock()
            .unwrap()
            .success(&CIRCUIT_BREAKERS_UNBROKEN);

        Ok(())
    }

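    // A condensed sketch (not part of the diff) of the circuit-breaker
    // protocol the compaction loop above follows; `do_work` stands in for the
    // fallible operation, and the two counters are the ones imported from
    // crate::metrics:
    //
    //     if breaker.lock().unwrap().is_broken() {
    //         return Ok(()); // skip work while the breaker is open
    //     }
    //     match do_work() {
    //         Ok(_) => breaker.lock().unwrap().success(&CIRCUIT_BREAKERS_UNBROKEN),
    //         Err(e) => breaker.lock().unwrap().fail(&CIRCUIT_BREAKERS_BROKEN, &e),
    //     }
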
@@ -1828,9 +1840,15 @@ impl Tenant {
        // If we're still attaching, fire the cancellation token early to drop out: this
        // will prevent us flushing, but ensures timely shutdown if some I/O during attach
        // is very slow.
        if matches!(self.current_state(), TenantState::Attaching) {
        let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
            self.cancel.cancel();
        }

            // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
            // are children of ours, so their flush loops will have shut down already
            timeline::ShutdownMode::Hard
        } else {
            shutdown_mode
        };

        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
@@ -1877,6 +1895,10 @@ impl Tenant {
        tracing::debug!("Waiting for tasks...");
        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;

        if let Some(walredo_mgr) = self.walredo_mgr.as_ref() {
            walredo_mgr.shutdown().await;
        }

        // Wait for any in-flight operations to complete
        self.gate.close().await;

@@ -2343,13 +2365,6 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    pub fn get_trace_read_requests(&self) -> bool {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

    pub fn get_min_resident_size_override(&self) -> Option<u64> {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -2493,7 +2508,12 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
    ) -> Tenant {
        debug_assert!(
            !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
        );

        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
@@ -2567,6 +2587,14 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
            compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
                format!("compaction-{tenant_shard_id}"),
                5,
                // Compaction can be a very expensive operation, and might leak disk space. It also ought
                // to be infallible, as long as remote storage is available. So if it repeatedly fails,
                // use an extremely long backoff.
                Some(Duration::from_secs(3600 * 24)),
            )),
            activate_now_sem: tokio::sync::Semaphore::new(0),
            cancel: CancellationToken::default(),
            gate: Gate::default(),
@@ -2576,6 +2604,7 @@ impl Tenant {
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
            l0_flush_global_state,
        }
    }

@@ -2583,59 +2612,35 @@ impl Tenant {
    pub(super) fn load_tenant_config(
        conf: &'static PageServerConf,
        tenant_shard_id: &TenantShardId,
    ) -> anyhow::Result<LocationConf> {
        let legacy_config_path = conf.tenant_config_path(tenant_shard_id);
    ) -> Result<LocationConf, LoadConfigError> {
        let config_path = conf.tenant_location_config_path(tenant_shard_id);

        if config_path.exists() {
            // New-style config takes precedence
            let deserialized = Self::read_config(&config_path)?;
            Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
        } else if legacy_config_path.exists() {
            // Upgrade path: found an old-style configuration only
            let deserialized = Self::read_config(&legacy_config_path)?;

            let mut tenant_conf = TenantConfOpt::default();
            for (key, item) in deserialized.iter() {
                match key {
                    "tenant_config" => {
                        tenant_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("Failed to parse config from file '{legacy_config_path}' as pageserver config"))?;
                    }
                    _ => bail!(
                        "config file {legacy_config_path} has unrecognized pageserver option '{key}'"
                    ),
                }
            }

            // Legacy configs are implicitly in attached state, and do not support sharding
            Ok(LocationConf::attached_single(
                tenant_conf,
                Generation::none(),
                &models::ShardParameters::default(),
            ))
        } else {
            // FIXME If the config file is not found, assume that we're attaching
            // a detached tenant and config is passed via attach command.
            // https://github.com/neondatabase/neon/issues/1555
            // OR: we're loading after incomplete deletion that managed to remove config.
            info!(
                "tenant config not found in {} or {}",
                config_path, legacy_config_path
            );
            Ok(LocationConf::default())
        }
    }

    fn read_config(path: &Utf8Path) -> anyhow::Result<toml_edit::Document> {
        info!("loading tenant configuration from {path}");
        info!("loading tenant configuration from {config_path}");

        // load and parse file
        let config = fs::read_to_string(path)
            .with_context(|| format!("Failed to load config from path '{path}'"))?;
        let config = fs::read_to_string(&config_path).map_err(|e| {
            match e.kind() {
                std::io::ErrorKind::NotFound => {
                    // The config should almost always exist for a tenant directory:
                    // - When attaching a tenant, the config is the first thing we write
                    // - When detaching a tenant, we atomically move the directory to a tmp location
                    //   before deleting contents.
                    //
                    // The very rare edge case that can result in a missing config is if we crash during attach
                    // between creating directory and writing config. Callers should handle that as if the
                    // directory didn't exist.

        config
            .parse::<toml_edit::Document>()
            .with_context(|| format!("Failed to parse config from file '{path}' as toml file"))
                    LoadConfigError::NotFound(config_path)
                }
                _ => {
                    // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
                    // that we cannot cleanly recover
                    crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
                }
            }
        })?;

        Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
    }
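
    // A hedged sketch (not part of the diff) of how a caller might consume
    // the new LoadConfigError; the match arms are illustrative only:
    //
    //     match Tenant::load_tenant_config(conf, &tenant_shard_id) {
    //         Ok(location_conf) => { /* proceed with attach */ }
    //         Err(LoadConfigError::NotFound(path)) => {
    //             /* rare crash-during-attach case: treat dir as incomplete */
    //         }
    //         Err(LoadConfigError::DeserializeToml(e)) => { /* surface as fatal */ }
    //     }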

    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
@@ -2643,48 +2648,18 @@ impl Tenant {
        conf: &'static PageServerConf,
        tenant_shard_id: &TenantShardId,
        location_conf: &LocationConf,
    ) -> anyhow::Result<()> {
        let legacy_config_path = conf.tenant_config_path(tenant_shard_id);
    ) -> std::io::Result<()> {
        let config_path = conf.tenant_location_config_path(tenant_shard_id);

        Self::persist_tenant_config_at(
            tenant_shard_id,
            &config_path,
            &legacy_config_path,
            location_conf,
        )
        .await
        Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
    }

    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(super) async fn persist_tenant_config_at(
        tenant_shard_id: &TenantShardId,
        config_path: &Utf8Path,
        legacy_config_path: &Utf8Path,
        location_conf: &LocationConf,
    ) -> anyhow::Result<()> {
        if let LocationMode::Attached(attach_conf) = &location_conf.mode {
            // The modern-style LocationConf config file requires a generation to be set. In case someone
            // is running a pageserver without the infrastructure to set generations, write out the legacy-style
            // config file that only contains TenantConf.
            //
            // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388

            if attach_conf.generation.is_none() {
                tracing::info!(
                    "Running without generations, writing legacy-style tenant config file"
                );
                Self::persist_tenant_config_legacy(
                    tenant_shard_id,
                    legacy_config_path,
                    &location_conf.tenant_conf,
                )
                .await?;

                return Ok(());
            }
        }

    ) -> std::io::Result<()> {
        debug!("persisting tenantconf to {config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -2693,53 +2668,20 @@ impl Tenant {
        .to_string();

        fail::fail_point!("tenant-config-before-write", |_| {
            anyhow::bail!("tenant-config-before-write");
            Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "tenant-config-before-write",
            ))
        });

        // Convert the config to a toml file.
        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
        conf_content +=
            &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");

        let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);

        let tenant_shard_id = *tenant_shard_id;
        let config_path = config_path.to_owned();
        let conf_content = conf_content.into_bytes();
        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
            .await
            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;

        Ok(())
    }

    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    async fn persist_tenant_config_legacy(
        tenant_shard_id: &TenantShardId,
        target_config_path: &Utf8Path,
        tenant_conf: &TenantConfOpt,
    ) -> anyhow::Result<()> {
        debug!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.

[tenant_config]
"#
        .to_string();

        // Convert the config to a toml file.
        conf_content += &toml_edit::ser::to_string(&tenant_conf)?;

        let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);

        let tenant_shard_id = *tenant_shard_id;
        let target_config_path = target_config_path.to_owned();
        let conf_content = conf_content.into_bytes();
        VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
            .await
            .with_context(|| {
                format!("write tenant {tenant_shard_id} config to {target_config_path}")
            })?;
        Ok(())
        VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
    }

    //
@@ -2957,9 +2899,40 @@ impl Tenant {
        {
            let mut target = timeline.gc_info.write().unwrap();

            // Cull any expired leases
            let now = SystemTime::now();
            target.leases.retain(|_, lease| !lease.is_expired(&now));

            timeline
                .metrics
                .valid_lsn_lease_count_gauge
                .set(target.leases.len() as u64);

            // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
            if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
                if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
                    target.within_ancestor_pitr =
                        timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
                }
            }

            // Update metrics that depend on GC state
            timeline
                .metrics
                .archival_size
                .set(if target.within_ancestor_pitr {
                    timeline.metrics.current_logical_size_gauge.get()
                } else {
                    0
                });
            timeline.metrics.pitr_history_size.set(
                timeline
                    .get_last_record_lsn()
                    .checked_sub(target.cutoffs.time)
                    .unwrap_or(Lsn(0))
                    .0,
            );
|
||||
match gc_cutoffs.remove(&timeline.timeline_id) {
|
||||
Some(cutoffs) => {
|
||||
target.retain_lsns = branchpoints;
|
||||
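
// Editor's sketch (not part of the diff): the lease-culling pattern above in
// miniature — retain() drops every lease whose expiry is in the past, and the
// number of survivors is what feeds valid_lsn_lease_count_gauge. The types
// below are stand-ins, not the pageserver's real lease type.

use std::collections::HashMap;
use std::time::SystemTime;

struct Lease {
    valid_until: SystemTime,
}

impl Lease {
    fn is_expired(&self, now: &SystemTime) -> bool {
        self.valid_until < *now
    }
}

fn cull_expired_leases(leases: &mut HashMap<u64, Lease>) -> usize {
    let now = SystemTime::now();
    // Keep only leases that are still valid, mirroring
    // `target.leases.retain(|_, lease| !lease.is_expired(&now))` above.
    leases.retain(|_, lease| !lease.is_expired(&now));
    leases.len() // the value the gauge is set to
}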
@@ -3011,7 +2984,7 @@ impl Tenant {
    dst_id: TimelineId,
    ancestor_lsn: Option<Lsn>,
    ctx: &RequestContext,
    delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
    delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
    image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
    end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
@@ -3395,6 +3368,7 @@ impl Tenant {
    TimelineResources {
        remote_client,
        timeline_get_throttle: self.timeline_get_throttle.clone(),
        l0_flush_global_state: self.l0_flush_global_state.clone(),
    }
}

@@ -3731,6 +3705,7 @@ pub(crate) mod harness {
    use utils::logging;

    use crate::deletion_queue::mock::MockDeletionQueue;
    use crate::l0_flush::L0FlushConfig;
    use crate::walredo::apply_neon;
    use crate::{repository::Key, walrecord::NeonWalRecord};

@@ -3768,7 +3743,6 @@ pub(crate) mod harness {
    walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
    lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
    max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
    trace_read_requests: Some(tenant_conf.trace_read_requests),
    eviction_policy: Some(tenant_conf.eviction_policy),
    min_resident_size_override: tenant_conf.min_resident_size_override,
    evictions_low_residence_duration_metric_threshold: Some(
@@ -3920,12 +3894,14 @@ pub(crate) mod harness {
    self.tenant_shard_id,
    self.remote_storage.clone(),
    self.deletion_queue.new_client(),
    // TODO: ideally we should run all unit tests with both configs
    L0FlushGlobalState::new(L0FlushConfig::default()),
));

let preload = tenant
    .preload(&self.remote_storage, CancellationToken::new())
    .await?;
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
tenant.attach(Some(preload), ctx).await?;

tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3953,7 +3929,7 @@ pub(crate) mod harness {
    base_img: Option<(Lsn, Bytes)>,
    records: Vec<(Lsn, NeonWalRecord)>,
    _pg_version: u32,
) -> anyhow::Result<Bytes> {
) -> Result<Bytes, walredo::Error> {
    let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
    if records_neon {
        // For Neon wal records, we can decode without spawning postgres, so do so.
@@ -4007,7 +3983,7 @@ mod tests {
    use storage_layer::PersistentLayerKey;
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
    use timeline::GcInfo;
    use timeline::{DeltaLayerTestDesc, GcInfo};
    use utils::bin_ser::BeSer;
    use utils::id::TenantId;

@@ -4286,7 +4262,7 @@ mod tests {
    .source()
    .unwrap()
    .to_string()
    .contains("is earlier than latest GC horizon"));
    .contains("is earlier than latest GC cutoff"));
}
}

@@ -6303,27 +6279,6 @@ mod tests {
    .await
    .unwrap();

    async fn get_vectored_impl_wrapper(
        tline: &Arc<Timeline>,
        key: Key,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Option<Bytes>, GetVectoredError> {
        let mut reconstruct_state = ValuesReconstructState::new();
        let mut res = tline
            .get_vectored_impl(
                KeySpace::single(key..key.next()),
                lsn,
                &mut reconstruct_state,
                ctx,
            )
            .await?;
        Ok(res.pop_last().map(|(k, v)| {
            assert_eq!(k, key);
            v.unwrap()
        }))
    }

    let lsn = Lsn(0x30);

    // test vectored get on parent timeline
@@ -6363,7 +6318,7 @@ mod tests {

#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
    let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
    let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
    let (tenant, ctx) = harness.load().await;

    let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6399,27 +6354,6 @@ mod tests {
    .await
    .unwrap();

    async fn get_vectored_impl_wrapper(
        tline: &Arc<Timeline>,
        key: Key,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Option<Bytes>, GetVectoredError> {
        let mut reconstruct_state = ValuesReconstructState::new();
        let mut res = tline
            .get_vectored_impl(
                KeySpace::single(key..key.next()),
                lsn,
                &mut reconstruct_state,
                ctx,
            )
            .await?;
        Ok(res.pop_last().map(|(k, v)| {
            assert_eq!(k, key);
            v.unwrap()
        }))
    }

    let lsn = Lsn(0x30);

    // test vectored get on parent timeline
@@ -6495,9 +6429,18 @@ mod tests {
    &ctx,
    // delta layers
    vec![
        vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
        vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
        vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x10)..Lsn(0x20),
            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
        ),
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x20)..Lsn(0x30),
            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
        ),
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x20)..Lsn(0x30),
            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
        ),
    ],
    // image layers
    vec![
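
// Editor's sketch (not part of the diff): the tests now wrap raw delta data in
// DeltaLayerTestDesc, pairing an explicit LSN range with the entries. A
// plausible shape for new_with_inferred_key_range — assuming "inferred key
// range" means scanning the entries for the smallest and largest key — is
// shown below with stand-in types; the real implementation may differ.

type Key = u64; // stand-in for pageserver_api::key::Key
type Lsn = u64; // stand-in for utils::lsn::Lsn
type Value = Vec<u8>; // stand-in for crate::repository::Value

struct DeltaLayerTestDesc {
    lsn_range: std::ops::Range<Lsn>,
    key_range: std::ops::RangeInclusive<Key>,
    data: Vec<(Key, Lsn, Value)>,
}

impl DeltaLayerTestDesc {
    fn new_with_inferred_key_range(
        lsn_range: std::ops::Range<Lsn>,
        data: Vec<(Key, Lsn, Value)>,
    ) -> Self {
        // One pass over the entries yields the key span the layer must cover.
        let min = data.iter().map(|(k, _, _)| *k).min().expect("non-empty layer");
        let max = data.iter().map(|(k, _, _)| *k).max().expect("non-empty layer");
        Self {
            lsn_range,
            key_range: min..=max,
            data,
        }
    }
}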
@@ -6563,17 +6506,29 @@ mod tests {
    &ctx,
    // delta layers
    vec![
        vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
        vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
        vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
        vec![
            (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
            (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
        ],
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x10)..Lsn(0x20),
            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
        ),
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x20)..Lsn(0x30),
            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
        ),
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x20)..Lsn(0x30),
            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
        ),
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x30)..Lsn(0x40),
            vec![
                (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
                (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
            ],
        ),
    ],
    // image layers
    vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
    Lsn(0x30),
    Lsn(0x40),
)
.await
.unwrap();
@@ -6596,7 +6551,7 @@ mod tests {

// Image layers are created at last_record_lsn
let images = tline
    .inspect_image_layers(Lsn(0x30), &ctx)
    .inspect_image_layers(Lsn(0x40), &ctx)
    .await
    .unwrap()
    .into_iter()
@@ -6622,9 +6577,18 @@ mod tests {
    &ctx,
    // delta layers
    vec![
        vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
        vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
        vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x10)..Lsn(0x20),
            vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
        ),
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x20)..Lsn(0x30),
            vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
        ),
        DeltaLayerTestDesc::new_with_inferred_key_range(
            Lsn(0x20)..Lsn(0x30),
            vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
        ),
    ],
    // image layers
    vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
@@ -6672,15 +6636,21 @@ mod tests {
    key
}

// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
// We create
// - one bottom-most image layer,
// - a delta layer D1 crossing the GC horizon with data below and above the horizon,
// - a delta layer D2 crossing the GC horizon with data only below the horizon,
// - a delta layer D3 above the horizon.
//
// | D1 | | D3 |
// | D3 |
// | D1 |
// -| |-- gc horizon -----------------
// | | | D2 |
// --------- img layer ------------------
//
// What we should expect from this compaction is:
// | Part of D1 | | D3 |
// | D3 |
// | Part of D1 |
// --------- img layer with D1+D2 at GC horizon------------------

// img layer at 0x10
@@ -6720,13 +6690,13 @@ mod tests {
let delta3 = vec![
    (
        get_key(8),
        Lsn(0x40),
        Value::Image(Bytes::from("value 8@0x40")),
        Lsn(0x48),
        Value::Image(Bytes::from("value 8@0x48")),
    ),
    (
        get_key(9),
        Lsn(0x40),
        Value::Image(Bytes::from("value 9@0x40")),
        Lsn(0x48),
        Value::Image(Bytes::from("value 9@0x48")),
    ),
];

@@ -6736,7 +6706,11 @@ mod tests {
    Lsn(0x10),
    DEFAULT_PG_VERSION,
    &ctx,
    vec![delta1, delta2, delta3], // delta layers
    vec![
        DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
        DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
        DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
    ], // delta layers
    vec![(Lsn(0x10), img_layer)], // image layers
    Lsn(0x50),
)
@@ -6744,8 +6718,8 @@ mod tests {
{
    // Update GC info
    let mut guard = tline.gc_info.write().unwrap();
    guard.cutoffs.pitr = Lsn(0x30);
    guard.cutoffs.horizon = Lsn(0x30);
    guard.cutoffs.time = Lsn(0x30);
    guard.cutoffs.space = Lsn(0x30);
}
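
// Editor's note (not part of the diff): the hunk above is the GcCutoffs field
// rename, pitr -> time and horizon -> space; the GcInfo literal further down
// in this diff shows the resulting shape. A stand-in sketch of the renamed
// struct (the Lsn alias here is a stand-in, not the real type):

type Lsn = u64;

struct GcCutoffs {
    time: Lsn,  // previously `pitr`: the PITR-window (time-based) cutoff
    space: Lsn, // previously `horizon`: the space-based GC horizon
}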

let expected_result = [
@@ -6757,8 +6731,8 @@ mod tests {
    Bytes::from_static(b"value 5@0x20"),
    Bytes::from_static(b"value 6@0x20"),
    Bytes::from_static(b"value 7@0x10"),
    Bytes::from_static(b"value 8@0x40"),
    Bytes::from_static(b"value 9@0x40"),
    Bytes::from_static(b"value 8@0x48"),
    Bytes::from_static(b"value 9@0x48"),
];

for (idx, expected) in expected_result.iter().enumerate() {
@@ -6846,10 +6820,10 @@ mod tests {
    lsn_range: Lsn(0x30)..Lsn(0x41),
    is_delta: true
},
// The delta layer we created and should not be picked for the compaction
// The delta3 layer that should not be picked for the compaction
PersistentLayerKey {
    key_range: get_key(8)..get_key(10),
    lsn_range: Lsn(0x40)..Lsn(0x41),
    lsn_range: Lsn(0x48)..Lsn(0x50),
    is_delta: true
}
]
@@ -6913,7 +6887,10 @@ mod tests {
    Lsn(0x10),
    DEFAULT_PG_VERSION,
    &ctx,
    vec![delta1], // delta layers
    vec![DeltaLayerTestDesc::new_with_inferred_key_range(
        Lsn(0x10)..Lsn(0x40),
        delta1,
    )], // delta layers
    vec![(Lsn(0x10), image1)], // image layers
    Lsn(0x50),
)
@@ -7037,15 +7014,21 @@ mod tests {
    key
}

// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
// We create
// - one bottom-most image layer,
// - a delta layer D1 crossing the GC horizon with data below and above the horizon,
// - a delta layer D2 crossing the GC horizon with data only below the horizon,
// - a delta layer D3 above the horizon.
//
// | D1 | | D3 |
// | D3 |
// | D1 |
// -| |-- gc horizon -----------------
// | | | D2 |
// --------- img layer ------------------
//
// What we should expect from this compaction is:
// | Part of D1 | | D3 |
// | D3 |
// | Part of D1 |
// --------- img layer with D1+D2 at GC horizon------------------

// img layer at 0x10
@@ -7064,6 +7047,16 @@ mod tests {
    Lsn(0x30),
    Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
    get_key(3),
    Lsn(0x28),
    Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
(
    get_key(3),
    Lsn(0x30),
    Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
    get_key(3),
    Lsn(0x40),
@@ -7085,13 +7078,13 @@ mod tests {
let delta3 = vec![
    (
        get_key(8),
        Lsn(0x40),
        Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
        Lsn(0x48),
        Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
    ),
    (
        get_key(9),
        Lsn(0x40),
        Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
        Lsn(0x48),
        Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
    ),
];

@@ -7101,7 +7094,11 @@ mod tests {
    Lsn(0x10),
    DEFAULT_PG_VERSION,
    &ctx,
    vec![delta1, delta2, delta3], // delta layers
    vec![
        DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
        DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
        DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
    ], // delta layers
    vec![(Lsn(0x10), img_layer)], // image layers
    Lsn(0x50),
)
@@ -7112,10 +7109,11 @@ mod tests {
*guard = GcInfo {
    retain_lsns: vec![],
    cutoffs: GcCutoffs {
        pitr: Lsn(0x30),
        horizon: Lsn(0x30),
        time: Lsn(0x30),
        space: Lsn(0x30),
    },
    leases: Default::default(),
    within_ancestor_pitr: false,
};
}

@@ -7123,20 +7121,20 @@ mod tests {
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x40"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x40"),
Bytes::from_static(b"value 9@0x10@0x40"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];

let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 3@0x10@0x28@0x30"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),

@@ -6,13 +6,20 @@
//! is written as one byte. If it's larger than that, the length
//! is written as a four-byte integer, in big-endian, with the high
//! bit set. This way, we can detect whether it's a 1- or 4-byte header
//! by peeking at the first byte.
//! by peeking at the first byte. For blobs of 128 bytes or larger,
//! we also specify three reserved bits; only one of the three bit
//! patterns is currently in use (0b001), and it signifies compression
//! with zstd.
//!
//! len < 128: 0XXXXXXX
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
use async_compression::Level;
use bytes::{BufMut, BytesMut};
use pageserver_api::models::ImageCompressionAlgorithm;
use tokio::io::AsyncWriteExt;
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
use tracing::warn;

use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
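
// Editor's sketch (not part of the diff): the header layout described above,
// worked through with the constants this file defines below
// (LEN_COMPRESSION_BIT_MASK = 0xf0, BYTE_UNCOMPRESSED = 0x80, BYTE_ZSTD = 0x90,
// MAX_SUPPORTED_LEN = 0x0fff_ffff). Function names are illustrative only.

fn encode_len_header(len: usize, zstd: bool) -> Vec<u8> {
    assert!(len <= 0x0fff_ffff, "blob too large ({len} bytes)"); // MAX_SUPPORTED_LEN
    if len < 128 {
        // Short blob: a single 0XXXXXXX byte; compression is not attempted.
        vec![len as u8]
    } else {
        // Long blob: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX, big-endian length.
        let mut buf = (len as u32).to_be_bytes();
        assert_eq!(buf[0] & 0xf0, 0); // top nibble must be free for the flags
        buf[0] |= if zstd { 0x90 } else { 0x80 }; // BYTE_ZSTD / BYTE_UNCOMPRESSED
        buf.to_vec()
    }
}

fn decode_len_header(first: u8, rest: [u8; 3]) -> std::io::Result<(usize, bool)> {
    if first & 0x80 == 0 {
        return Ok((first as usize, false)); // 1-byte header, never compressed
    }
    let compression_bits = first & 0xf0; // LEN_COMPRESSION_BIT_MASK
    let len = u32::from_be_bytes([first & 0x0f, rest[0], rest[1], rest[2]]) as usize;
    match compression_bits {
        0x80 => Ok((len, false)), // BYTE_UNCOMPRESSED
        0x90 => Ok((len, true)),  // BYTE_ZSTD
        other => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("invalid compression byte {other:x}"),
        )),
    }
}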
@@ -66,12 +73,37 @@ impl<'a> BlockCursor<'a> {
    len_buf.copy_from_slice(&buf[off..off + 4]);
    off += 4;
}
len_buf[0] &= 0x7f;
let bit_mask = if self.read_compressed {
    !LEN_COMPRESSION_BIT_MASK
} else {
    0x7f
};
len_buf[0] &= bit_mask;
u32::from_be_bytes(len_buf) as usize
};
let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;

dstbuf.clear();
dstbuf.reserve(len);
let mut tmp_buf = Vec::new();
let buf_to_write;
let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed {
    if compression_bits > BYTE_UNCOMPRESSED {
        warn!("reading key above future limit ({len} bytes)");
    }
    buf_to_write = dstbuf;
    None
} else if compression_bits == BYTE_ZSTD {
    buf_to_write = &mut tmp_buf;
    Some(dstbuf)
} else {
    let error = std::io::Error::new(
        std::io::ErrorKind::InvalidData,
        format!("invalid compression byte {compression_bits:x}"),
    );
    return Err(error);
};

buf_to_write.clear();
buf_to_write.reserve(len);

// Read the payload
let mut remain = len;
@@ -85,14 +117,35 @@ impl<'a> BlockCursor<'a> {
    page_remain = PAGE_SZ;
}
let this_blk_len = min(remain, page_remain);
dstbuf.extend_from_slice(&buf[off..off + this_blk_len]);
buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]);
remain -= this_blk_len;
off += this_blk_len;
}

if let Some(dstbuf) = compression {
    if compression_bits == BYTE_ZSTD {
        let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf);
        decoder.write_all(buf_to_write).await?;
        decoder.flush().await?;
    } else {
        unreachable!("already checked above")
    }
}

Ok(())
}
}

/// Reserved bits for length and compression
pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;

/// The maximum size of blobs we support. The highest few bits
/// are reserved for compression and other further uses.
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;

pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

/// A wrapper of `VirtualFile` that allows users to write blobs.
///
/// If a `BlobWriter` is dropped, the internal buffer will be
@@ -219,6 +272,18 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    &mut self,
    srcbuf: B,
    ctx: &RequestContext,
) -> (B::Buf, Result<u64, Error>) {
    self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
        .await
}

/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
    &mut self,
    srcbuf: B,
    ctx: &RequestContext,
    algorithm: ImageCompressionAlgorithm,
) -> (B::Buf, Result<u64, Error>) {
    let offset = self.offset;

@@ -226,29 +291,60 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {

let mut io_buf = self.io_buf.take().expect("we always put it back below");
io_buf.clear();
let (io_buf, hdr_res) = async {
let mut compressed_buf = None;
let ((io_buf, hdr_res), srcbuf) = async {
    if len < 128 {
        // Short blob. Write a 1-byte length header
        io_buf.put_u8(len as u8);
        self.write_all(io_buf, ctx).await
        (
            self.write_all(io_buf, ctx).await,
            srcbuf.slice_full().into_inner(),
        )
    } else {
        // Write a 4-byte length header
        if len > 0x7fff_ffff {
        if len > MAX_SUPPORTED_LEN {
            return (
                io_buf,
                Err(Error::new(
                    ErrorKind::Other,
                    format!("blob too large ({len} bytes)"),
                )),
                (
                    io_buf,
                    Err(Error::new(
                        ErrorKind::Other,
                        format!("blob too large ({len} bytes)"),
                    )),
                ),
                srcbuf.slice_full().into_inner(),
            );
        }
        if len > 0x0fff_ffff {
            tracing::warn!("writing blob above future limit ({len} bytes)");
        }
        let mut len_buf = (len as u32).to_be_bytes();
        len_buf[0] |= 0x80;
        let (high_bit_mask, len_written, srcbuf) = match algorithm {
            ImageCompressionAlgorithm::Zstd { level } => {
                let mut encoder = if let Some(level) = level {
                    async_compression::tokio::write::ZstdEncoder::with_quality(
                        Vec::new(),
                        Level::Precise(level.into()),
                    )
                } else {
                    async_compression::tokio::write::ZstdEncoder::new(Vec::new())
                };
                let slice = srcbuf.slice_full();
                encoder.write_all(&slice[..]).await.unwrap();
                encoder.shutdown().await.unwrap();
                let compressed = encoder.into_inner();
                if compressed.len() < len {
                    let compressed_len = compressed.len();
                    compressed_buf = Some(compressed);
                    (BYTE_ZSTD, compressed_len, slice.into_inner())
                } else {
                    (BYTE_UNCOMPRESSED, len, slice.into_inner())
                }
            }
            ImageCompressionAlgorithm::Disabled => {
                (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
            }
        };
        let mut len_buf = (len_written as u32).to_be_bytes();
        assert_eq!(len_buf[0] & 0xf0, 0);
        len_buf[0] |= high_bit_mask;
        io_buf.extend_from_slice(&len_buf[..]);
        self.write_all(io_buf, ctx).await
        (self.write_all(io_buf, ctx).await, srcbuf)
    }
}
.await;
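
// Editor's sketch (not part of the diff): the write path above compresses with
// zstd but only keeps the result when it is actually smaller than the input;
// otherwise the blob goes out uncompressed under BYTE_UNCOMPRESSED. A
// self-contained tokio example of that decision (level 1, as the tests use):

use async_compression::tokio::write::ZstdEncoder;
use async_compression::Level;
use tokio::io::AsyncWriteExt;

async fn maybe_compress(src: &[u8]) -> std::io::Result<(Vec<u8>, bool)> {
    let mut encoder = ZstdEncoder::with_quality(Vec::new(), Level::Precise(1));
    encoder.write_all(src).await?;
    encoder.shutdown().await?; // finish the zstd frame
    let compressed = encoder.into_inner();
    if compressed.len() < src.len() {
        Ok((compressed, true)) // smaller: write it with the BYTE_ZSTD flag
    } else {
        Ok((src.to_vec(), false)) // no win: write the original, BYTE_UNCOMPRESSED
    }
}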
@@ -257,7 +353,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    Ok(_) => (),
    Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
}
let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
    let (_buf, res) = self.write_all(compressed_buf, ctx).await;
    (Slice::into_inner(srcbuf.slice(..)), res)
} else {
    self.write_all(srcbuf, ctx).await
};
(srcbuf, res.map(|_| offset))
}
}
@@ -289,37 +390,65 @@ impl BlobWriter<false> {
}

#[cfg(test)]
mod tests {
pub(crate) mod tests {
    use super::*;
    use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
    use camino::Utf8PathBuf;
    use camino_tempfile::Utf8TempDir;
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
        round_trip_test_compressed::<BUFFERED>(blobs, false).await
    }

    pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
        blobs: &[Vec<u8>],
        compression: bool,
        ctx: &RequestContext,
    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
                let (_, res) = if compression {
                    wtr.write_blob_maybe_compressed(
                        blob.clone(),
                        ctx,
                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
                    )
                    .await
                } else {
                    wtr.write_blob(blob.clone(), ctx).await
                };
                let offs = res?;
                offsets.push(offs);
            }
            // Write out one page worth of zeros so that we can
            // read again with read_blk
            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
            let offs = res?;
            println!("Writing final blob at offs={offs}");
            wtr.flush_buffer(&ctx).await?;
            wtr.flush_buffer(ctx).await?;
        }
        Ok((temp_dir, pathbuf, offsets))
    }

    let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
    async fn round_trip_test_compressed<const BUFFERED: bool>(
        blobs: &[Vec<u8>],
        compression: bool,
    ) -> Result<(), Error> {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
        let (_temp_dir, pathbuf, offsets) =
            write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;

        let file = VirtualFile::open(pathbuf, &ctx).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new(rdr);
        let rdr = BlockCursor::new_with_compression(rdr, compression);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
            let blob_read = rdr.read_blob(*offset, &ctx).await?;
            assert_eq!(
@@ -330,7 +459,7 @@ mod tests {
        Ok(())
    }

    fn random_array(len: usize) -> Vec<u8> {
    pub(crate) fn random_array(len: usize) -> Vec<u8> {
        let mut rng = rand::thread_rng();
        (0..len).map(|_| rng.gen()).collect::<_>()
    }
@@ -353,6 +482,8 @@ mod tests {
        ];
        round_trip_test::<false>(blobs).await?;
        round_trip_test::<true>(blobs).await?;
        round_trip_test_compressed::<false>(blobs, true).await?;
        round_trip_test_compressed::<true>(blobs, true).await?;
        Ok(())
    }

@@ -361,10 +492,15 @@ mod tests {
        let blobs = &[
            b"test".to_vec(),
            random_array(10 * PAGE_SZ),
            b"hello".to_vec(),
            random_array(66 * PAGE_SZ),
            vec![0xf3; 24 * PAGE_SZ],
            b"foobar".to_vec(),
        ];
        round_trip_test::<false>(blobs).await?;
        round_trip_test::<true>(blobs).await?;
        round_trip_test_compressed::<false>(blobs, true).await?;
        round_trip_test_compressed::<true>(blobs, true).await?;
        Ok(())
    }
