More commenting out

more commenting out
Comment out some tests
2026-05-21 15:10:44 +00:00 · 2024-07-10 03:27:33 +02:00 · 2024-07-09 18:18:53 +02:00 · 2024-07-09 18:11:58 +02:00 · 2024-07-09 15:40:25 +02:00
206 changed files with 3232 additions and 8616 deletions
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -9,8 +9,8 @@ inputs:
    description: 'Region ID, if not set the project will be created in the default region'
    default: aws-us-east-2
  postgres_version:
-    description: 'Postgres version; default is 16'
-    default: '16'
+    description: 'Postgres version; default is 15'
+    default: '15'
  api_host:
    description: 'Neon API host'
    default: console-stage.neon.build
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -115,7 +115,6 @@ runs:
        export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
        export DEFAULT_PG_VERSION=${PG_VERSION#v}
        export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
-        export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -56,27 +56,15 @@ concurrency:
 jobs:
  bench:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - DEFAULT_PG_VERSION: 16
-            PLATFORM: "neon-staging"
-            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
-            provisioner: 'k8s-pod' 
-          - DEFAULT_PG_VERSION: 16
-            PLATFORM: "azure-staging"
-            region_id: 'azure-eastus2'
-            provisioner: 'k8s-neonvm'
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: ${{ matrix.PLATFORM }}
+      PLATFORM: "neon-staging"

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -97,10 +85,9 @@ jobs:
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
-        region_id: ${{ matrix.region_id }}
+        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-        provisioner: ${{ matrix.provisioner }}

    - name: Run benchmark
      uses: ./.github/actions/run-python-test-set
@@ -109,18 +96,10 @@ jobs:
        test_selection: performance
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-        extra_params:
-          -m remote_cluster
-          --sparse-ordering
-          --timeout 14400
-          --ignore test_runner/performance/test_perf_olap.py
-          --ignore test_runner/performance/test_perf_pgvector_queries.py
-          --ignore test_runner/performance/test_logical_replication.py
-          --ignore test_runner/performance/test_physical_replication.py
+        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -146,71 +125,6 @@ jobs:
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

-  replication-tests:
-    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
-    env:
-      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 14
-      TEST_OUTPUT: /tmp/test_output
-      BUILD_TYPE: remote
-      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: "neon-staging"
-
-    runs-on: [ self-hosted, us-east-2, x64 ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
-      options: --init
-
-    steps:
-    - uses: actions/checkout@v4
-
-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-
-    - name: Run benchmark
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_logical_replication.py
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 5400
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Run benchmark
-      uses: ./.github/actions/run-python-test-set
-      with:
-        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance/test_physical_replication.py
-        run_in_parallel: false
-        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 5400
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
-
-    - name: Create Allure report
-      if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report-generate
-
-    - name: Post to a Slack channel
-      if: ${{ github.event.schedule && failure() }}
-      uses: slackapi/slack-github-action@v1
-      with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
  generate-matrices:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
@@ -218,14 +132,11 @@ jobs:
    # Available platforms:
    # - neon-captest-new: Freshly created project (1 CU)
    # - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
-    # - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
-    # - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
    # - neon-captest-reuse: Reusing existing project
    # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
    # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
    env:
      RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
-      DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
    runs-on: ubuntu-22.04
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -236,33 +147,23 @@ jobs:
    - name: Generate matrix for pgbench benchmark
      id: pgbench-compare-matrix
      run: |
-        region_id_default=${{ env.DEFAULT_REGION_ID }}
        matrix='{
-          "pg_version" : [
-            16
-          ],
-          "region_id" : [
-            "'"$region_id_default"'"
-            ],
          "platform": [
            "neon-captest-new",
            "neon-captest-reuse",
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier",         "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new",              "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-freetier", "db_size": "3gb"  },
-                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "10gb" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",      "platform": "neonvm-azure-captest-new",      "db_size": "50gb" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
-                                                     { "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora",   "db_size": "50gb"}]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -314,7 +215,7 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
      TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
+      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -339,14 +240,14 @@ jobs:
        prefix: latest

    - name: Create Neon Project
-      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
+      if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
      id: create-neon-project
      uses: ./.github/actions/neon-project-create
      with:
-        region_id: ${{ matrix.region_id }}
+        region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
+        compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
        provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}

    - name: Set up Connection String
@@ -359,7 +260,7 @@ jobs:
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
-          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
+          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
          rds-aurora)
@@ -384,7 +285,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -398,7 +298,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -412,7 +311,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -439,13 +337,6 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  pgbench-pgvector:
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - PLATFORM: "neon-captest-pgvector"
-          - PLATFORM: "azure-captest-pgvector"
-            
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -453,9 +344,8 @@ jobs:
      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
-      LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
-      PLATFORM: ${{ matrix.PLATFORM }}
+      PLATFORM: "neon-captest-pgvector"

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -465,39 +355,17 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
-    # instead of using Neon artifacts containing pgbench
-    - name: Install postgresql-16 where pytest expects it
-      run: |
-        cd /home/nonroot
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
-        wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb 
-        dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
-        dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
-        mkdir -p /tmp/neon/pg_install/v16/bin
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench  
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql  
-        ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib 
-        /tmp/neon/pg_install/v16/bin/pgbench --version
-        /tmp/neon/pg_install/v16/bin/psql --version
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest

    - name: Set up Connection String
      id: set-up-connstr
      run: |
-        case "${PLATFORM}" in
-          neon-captest-pgvector)
-            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
-            ;;
-          azure-captest-pgvector)
-            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
-            ;;
-          *)
-            echo >&2 "Unknown PLATFORM=${PLATFORM}"
-            exit 1
-            ;;
-        esac
+        CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

@@ -509,7 +377,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -523,7 +390,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -538,10 +404,11 @@ jobs:
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

+
  clickbench-compare:
    # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
    # we use for performance testing in pgbench-compare.
@@ -785,7 +652,6 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1336,7 +1336,6 @@ jobs:
        env:
          BUCKET: neon-github-public-dev
          PREFIX: artifacts/latest
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          # Update compatibility snapshot for the release
          for pg_version in v14 v15 v16; do
@@ -1350,7 +1349,7 @@ jobs:

          # Update Neon artifact for the release (reuse already uploaded artifact)
          for build_type in debug release; do
-            OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
+            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst

            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1236,7 +1236,6 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest 0.12.4",
- "rlimit",
 "rust-ini",
 "serde",
 "serde_json",
@@ -1368,7 +1367,6 @@ dependencies = [
 "tracing",
 "url",
 "utils",
- "whoami",
 "workspace_hack",
 ]

@@ -1399,9 +1397,9 @@ dependencies = [

 [[package]]
 name = "crc32c"
-version = "0.6.8"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
+checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
 dependencies = [
 "rustc_version",
 ]
@@ -1653,16 +1651,6 @@ dependencies = [
 "rusticata-macros",
 ]

-[[package]]
-name = "deranged"
-version = "0.3.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
-dependencies = [
- "powerfmt",
- "serde",
-]
-
 [[package]]
 name = "desim"
 version = "0.1.0"
@@ -2029,6 +2017,16 @@ dependencies = [
 "tokio-util",
 ]

+[[package]]
+name = "fs2"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"
@@ -3010,9 +3008,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
+checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
 "bytes",
 "crossbeam-utils",
@@ -3028,9 +3026,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
+checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",
@@ -3040,9 +3038,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.22"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
+checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
 dependencies = [
 "libc",
 "measured",
@@ -3234,6 +3232,16 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -3267,12 +3275,6 @@ dependencies = [
 "num-traits",
 ]

-[[package]]
-name = "num-conv"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -3529,6 +3531,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -3659,7 +3667,6 @@ dependencies = [
 "sysinfo",
 "tenant_size_model",
 "thiserror",
- "tikv-jemallocator",
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
@@ -4070,7 +4077,6 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
- "tokio-util",
 "tracing",
 "workspace_hack",
 ]
@@ -4111,12 +4117,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "powerfmt"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
-
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -4389,7 +4389,6 @@ dependencies = [
 "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
- "typed-json",
 "url",
 "urlencoding",
 "utils",
@@ -4588,15 +4587,6 @@ dependencies = [
 "bitflags 1.3.2",
 ]

-[[package]]
-name = "redox_syscall"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
-dependencies = [
- "bitflags 1.3.2",
-]
-
 [[package]]
 name = "regex"
 version = "1.10.2"
@@ -4887,15 +4877,6 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

-[[package]]
-name = "rlimit"
-version = "0.10.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "routerify"
 version = "3.0.0"
@@ -5164,6 +5145,7 @@ dependencies = [
 "crc32c",
 "desim",
 "fail",
+ "fs2",
 "futures",
 "git-version",
 "hex",
@@ -5190,8 +5172,6 @@ dependencies = [
 "sha2",
 "signal-hook",
 "storage_broker",
- "strum",
- "strum_macros",
 "thiserror",
 "tokio",
 "tokio-io-timeout",
@@ -5416,9 +5396,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

 [[package]]
 name = "serde"
-version = "1.0.203"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
+checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
 dependencies = [
 "serde_derive",
 ]
@@ -5435,9 +5415,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.203"
+version = "1.0.183"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
+checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -5806,28 +5786,6 @@ dependencies = [
 "workspace_hack",
 ]

-[[package]]
-name = "storage_controller_client"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-trait",
- "bytes",
- "futures",
- "pageserver_api",
- "pageserver_client",
- "postgres",
- "reqwest 0.12.4",
- "serde",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-stream",
- "tokio-util",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "storage_scrubber"
 version = "0.1.0"
@@ -5862,7 +5820,6 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_with",
- "storage_controller_client",
 "thiserror",
 "tokio",
 "tokio-postgres",
@@ -5892,7 +5849,6 @@ dependencies = [
 "reqwest 0.12.4",
 "serde",
 "serde_json",
- "storage_controller_client",
 "thiserror",
 "tokio",
 "tracing",
@@ -6151,15 +6107,12 @@ dependencies = [

 [[package]]
 name = "time"
-version = "0.3.36"
+version = "0.3.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
+checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
 dependencies = [
- "deranged",
 "itoa",
 "js-sys",
- "num-conv",
- "powerfmt",
 "serde",
 "time-core",
 "time-macros",
@@ -6167,17 +6120,16 @@ dependencies = [

 [[package]]
 name = "time-core"
-version = "0.1.2"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
+checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"

 [[package]]
 name = "time-macros"
-version = "0.2.18"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
+checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
 dependencies = [
- "num-conv",
 "time-core",
 ]

@@ -6520,6 +6472,17 @@ version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"

+[[package]]
+name = "trace"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "pageserver_api",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "tracing"
 version = "0.1.37"
@@ -6619,6 +6582,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -6683,16 +6647,6 @@ dependencies = [
 "static_assertions",
 ]

-[[package]]
-name = "typed-json"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
-dependencies = [
- "serde",
- "serde_json",
-]
-
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -6850,7 +6804,6 @@ dependencies = [
 "serde_path_to_error",
 "serde_with",
 "signal-hook",
- "smallvec",
 "strum",
 "strum_macros",
 "thiserror",
@@ -6990,12 +6943,6 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

-[[package]]
-name = "wasite"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
-
 [[package]]
 name = "wasm-bindgen"
 version = "0.2.92"
@@ -7148,17 +7095,6 @@ dependencies = [
 "once_cell",
 ]

-[[package]]
-name = "whoami"
-version = "1.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
-dependencies = [
- "redox_syscall 0.4.1",
- "wasite",
- "web-sys",
-]
-
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -7491,12 +7427,13 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
- "deranged",
 "either",
 "fail",
 "futures-channel",
+ "futures-core",
 "futures-executor",
 "futures-io",
+ "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
 "hashbrown 0.14.5",
@@ -7514,9 +7451,7 @@ dependencies = [
 "num-traits",
 "once_cell",
 "parquet",
- "proc-macro2",
 "prost",
- "quote",
 "rand 0.8.5",
 "regex",
 "regex-automata 0.4.3",
@@ -7533,7 +7468,6 @@ dependencies = [
 "syn 1.0.109",
 "syn 2.0.52",
 "sync_wrapper",
- "tikv-jemalloc-sys",
 "time",
 "time-macros",
 "tokio",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,9 +13,9 @@ members = [
    "safekeeper",
    "storage_broker",
    "storage_controller",
-    "storage_controller/client",
    "storage_scrubber",
    "workspace_hack",
+    "trace",
    "libs/compute_api",
    "libs/pageserver_api",
    "libs/postgres_ffi",
@@ -84,6 +84,7 @@ enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
 framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
+fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
@@ -110,8 +111,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.22", features=["lasso"] }
-measured-process = { version = "0.0.22" }
+measured = { version = "0.0.21", features=["lasso"] }
+measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
@@ -183,16 +184,14 @@ tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
 twox-hash = { version = "1.6.3", default-features = false }
-typed-json = "0.1"
 url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 rustls-native-certs = "0.7"
 x509-parser = "0.15"
-whoami = "1.5.1"

 ## TODO replace this with tracing
 env_logger = "0.10"
@@ -222,7 +221,6 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
 safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
 desim = { version = "0.1", path = "./libs/desim" }
 storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
-storage_controller_client = { path = "./storage_controller/client" }
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -311,12 +311,9 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
 FROM build-deps AS rum-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-COPY patches/rum.patch /rum.patch
-
 RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
-    patch -p1 < /rum.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -44,4 +44,3 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
 bytes = "1.0"
 rust-ini = "0.20.0"
-rlimit = "0.10.1"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -6,7 +6,7 @@
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
 //! - If remote_extension_config is provided, it will be used to fetch extensions list
-//!   and download `shared_preload_libraries` from the remote storage.
+//!  and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -33,6 +33,7 @@
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
+//!
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
@@ -63,7 +64,6 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
 use compute_tools::swap::resize_swap;
-use rlimit::{setrlimit, Resource};

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
@@ -72,9 +72,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
    let (build_tag, clap_args) = init()?;

-    // enable core dumping for all child processes
-    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
-
    let (pg_handle, start_pg_result) = {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -56,7 +56,6 @@ pub struct ComputeNode {
    /// - we push new spec and it does reconfiguration
    /// - but then something happens and compute pod / VM is destroyed,
    ///   so k8s controller starts it again with the **old** spec
-    ///
    /// and the same for empty computes:
    /// - we started compute without any spec
    /// - we push spec and it does configuration
@@ -799,11 +798,7 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let mut connstr = self.connstr.clone();
-        connstr
-            .query_pairs_mut()
-            .append_pair("application_name", "apply_config");
-
+        let connstr = self.connstr.clone();
        let mut client = match Client::connect(connstr.as_str(), NoTls) {
            Err(e) => match e.code() {
                Some(&SqlState::INVALID_PASSWORD)
@@ -872,11 +867,6 @@ impl ComputeNode {

        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
-            let mut connstr = connstr.clone();
-            connstr
-                .query_pairs_mut()
-                .append_pair("application_name", "migrations");
-
            let mut client = Client::connect(connstr.as_str(), NoTls)?;
            handle_migrations(&mut client).context("apply_config handle_migrations")
        });
@@ -1117,7 +1107,7 @@ impl ComputeNode {
    // EKS worker nodes have following core dump settings:
    //   /proc/sys/kernel/core_pattern -> core
    //   /proc/sys/kernel/core_uses_pid -> 1
-    //   ulimit -c -> unlimited
+    //   ulimint -c -> unlimited
    // which results in core dumps being written to postgres data directory as core.<pid>.
    //
    // Use that as a default location and pattern, except macos where core dumps are written
@@ -1396,9 +1386,7 @@ pub fn forward_termination_signal() {
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
-        // ROs to get a list of running xacts faster instead of going through the CLOG.
-        // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
-        kill(pg_pid, Signal::SIGINT).ok();
+        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
+        kill(pg_pid, Signal::SIGQUIT).ok();
    }
 }
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -11,7 +11,6 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
-mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -1,105 +0,0 @@
-use anyhow::{Context, Result};
-use postgres::Client;
-use tracing::info;
-
-pub(crate) struct MigrationRunner<'m> {
-    client: &'m mut Client,
-    migrations: &'m [&'m str],
-}
-
-impl<'m> MigrationRunner<'m> {
-    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
-        // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
-        assert!(migrations.len() + 1 < i64::MAX as usize);
-
-        Self { client, migrations }
-    }
-
-    fn get_migration_id(&mut self) -> Result<i64> {
-        let query = "SELECT id FROM neon_migration.migration_id";
-        let row = self
-            .client
-            .query_one(query, &[])
-            .context("run_migrations get migration_id")?;
-
-        Ok(row.get::<&str, i64>("id"))
-    }
-
-    fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
-        let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
-
-        self.client
-            .simple_query(&setval)
-            .context("run_migrations update id")?;
-
-        Ok(())
-    }
-
-    fn prepare_migrations(&mut self) -> Result<()> {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        self.client.simple_query(query)?;
-
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        self.client.simple_query(query)?;
-
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        self.client.simple_query(query)?;
-
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        self.client.simple_query(query)?;
-
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        self.client.simple_query(query)?;
-
-        Ok(())
-    }
-
-    pub fn run_migrations(mut self) -> Result<()> {
-        self.prepare_migrations()?;
-
-        let mut current_migration = self.get_migration_id()? as usize;
-        while current_migration < self.migrations.len() {
-            macro_rules! migration_id {
-                ($cm:expr) => {
-                    ($cm + 1) as i64
-                };
-            }
-
-            let migration = self.migrations[current_migration];
-
-            if migration.starts_with("-- SKIP") {
-                info!("Skipping migration id={}", migration_id!(current_migration));
-            } else {
-                info!(
-                    "Running migration id={}:\n{}\n",
-                    migration_id!(current_migration),
-                    migration
-                );
-
-                self.client
-                    .simple_query("BEGIN")
-                    .context("begin migration")?;
-
-                self.client.simple_query(migration).with_context(|| {
-                    format!(
-                        "run_migrations migration id={}",
-                        migration_id!(current_migration)
-                    )
-                })?;
-
-                // Migration IDs start at 1
-                self.update_migration_id(migration_id!(current_migration))?;
-
-                self.client
-                    .simple_query("COMMIT")
-                    .context("commit migration")?;
-
-                info!("Finished migration id={}", migration_id!(current_migration));
-            }
-
-            current_migration += 1;
-        }
-
-        Ok(())
-    }
-}
--- a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
+++ b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
--- a/compute_tools/src/migrations/0001-alter_roles.sql
+++ b/compute_tools/src/migrations/0001-alter_roles.sql
--- a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
--- a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
--- a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
--- a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
+++ b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
--- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
@@ -1,7 +0,0 @@
-DO $$
-BEGIN
-    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
-       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
-       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
-    END IF;
-END $$;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -10,7 +10,6 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::logger::inlinify;
-use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -777,25 +776,84 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {

    // Add new migrations in numerical order.
    let migrations = [
-        include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
-        include_str!("./migrations/0002-alter_roles.sql"),
-        include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
-        include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
-        include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
-        include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
+        include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
+        include_str!("./migrations/0001-alter_roles.sql"),
+        include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
+        include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
+        include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
+        include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
        include_str!(
-            "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
+            "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
        ),
        include_str!(
-            "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
-        ),
-        include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
-        include_str!(
-            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
+            "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
        ),
+        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
    ];

-    MigrationRunner::new(client, &migrations).run_migrations()?;
+    let mut func = || {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        client.simple_query(query)?;
+        Ok::<_, anyhow::Error>(())
+    };
+    func().context("handle_migrations prepare")?;
+
+    let query = "SELECT id FROM neon_migration.migration_id";
+    let row = client
+        .query_one(query, &[])
+        .context("handle_migrations get migration_id")?;
+    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
+    let starting_migration_id = current_migration;
+
+    let query = "BEGIN";
+    client
+        .simple_query(query)
+        .context("handle_migrations begin")?;
+
+    while current_migration < migrations.len() {
+        let migration = &migrations[current_migration];
+        if migration.starts_with("-- SKIP") {
+            info!("Skipping migration id={}", current_migration);
+        } else {
+            info!(
+                "Running migration id={}:\n{}\n",
+                current_migration, migration
+            );
+            client.simple_query(migration).with_context(|| {
+                format!("handle_migrations current_migration={}", current_migration)
+            })?;
+        }
+        current_migration += 1;
+    }
+    let setval = format!(
+        "UPDATE neon_migration.migration_id SET id={}",
+        migrations.len()
+    );
+    client
+        .simple_query(&setval)
+        .context("handle_migrations update id")?;
+
+    let query = "COMMIT";
+    client
+        .simple_query(query)
+        .context("handle_migrations commit")?;
+
+    info!(
+        "Ran {} migrations",
+        (migrations.len() - starting_migration_id)
+    );

    Ok(())
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -40,7 +40,6 @@ safekeeper_api.workspace = true
 postgres_connection.workspace = true
 storage_broker.workspace = true
 utils.workspace = true
-whoami.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -1,9 +1,9 @@
 //! Code to manage the storage broker
 //!
-//! In the local test environment, the storage broker stores its data directly in
+//! In the local test environment, the data for each safekeeper is stored in
 //!
 //! ```text
-//!   .neon
+//!   .neon/safekeepers/<safekeeper id>
 //! ```
 use std::time::Duration;

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,10 +1,8 @@
 //! Code to manage pageservers
 //!
-//! In the local test environment, the data for each pageserver is stored in
+//! In the local test environment, the pageserver stores its data directly in
 //!
-//! ```text
-//!   .neon/pageserver_<pageserver_id>
-//! ```
+//!   .neon/
 //!
 use std::collections::HashMap;

@@ -17,6 +15,7 @@ use std::time::Duration;

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
+use futures::SinkExt;
 use pageserver_api::models::{
    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -351,6 +350,11 @@ impl PageServerNode {
                .map(|x| x.parse::<NonZeroU64>())
                .transpose()
                .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+            trace_read_requests: settings
+                .remove("trace_read_requests")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'trace_read_requests' as bool")?,
            eviction_policy: settings
                .remove("eviction_policy")
                .map(serde_json::from_str)
@@ -451,6 +455,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<NonZeroU64>())
                    .transpose()
                    .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
+                trace_read_requests: settings
+                    .remove("trace_read_requests")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'trace_read_requests' as bool")?,
                eviction_policy: settings
                    .remove("eviction_policy")
                    .map(serde_json::from_str)
@@ -557,39 +566,60 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
+        let (client, conn) = self.page_server_psql_client().await?;
+        // The connection object performs the actual communication with the database,
+        // so spawn it off to run on its own.
+        tokio::spawn(async move {
+            if let Err(e) = conn.await {
+                eprintln!("connection error: {}", e);
+            }
+        });
+        let client = std::pin::pin!(client);
+
        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
        let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
-        let base_tarfile =
-            mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
+        let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);

        // Init wal reader if necessary
        let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
            let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
-            let wal_reader =
-                mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
+            let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
            (end_lsn, Some(wal_reader))
        } else {
            (start_lsn, None)
        };

-        // Import base
-        self.http_client
-            .import_basebackup(
-                tenant_id,
-                timeline_id,
-                start_lsn,
-                end_lsn,
-                pg_version,
-                base_tarfile,
-            )
-            .await?;
+        let copy_in = |reader, cmd| {
+            let client = &client;
+            async move {
+                let writer = client.copy_in(&cmd).await?;
+                let writer = std::pin::pin!(writer);
+                let mut writer = writer.sink_map_err(|e| {
+                    std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
+                });
+                let mut reader = std::pin::pin!(reader);
+                writer.send_all(&mut reader).await?;
+                writer.into_inner().finish().await?;
+                anyhow::Ok(())
+            }
+        };

+        // Import base
+        copy_in(
+            base_tarfile,
+            format!(
+                "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
+            ),
+        )
+        .await?;
        // Import wal if necessary
        if let Some(wal_reader) = wal_reader {
-            self.http_client
-                .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
-                .await?;
+            copy_in(
+                wal_reader,
+                format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
+            )
+            .await?;
        }

        Ok(())
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -29,6 +29,7 @@ use utils::{
 pub struct StorageController {
    env: LocalEnv,
    listen: String,
+    path: Utf8PathBuf,
    private_key: Option<Vec<u8>>,
    public_key: Option<String>,
    postgres_port: u16,
@@ -40,8 +41,6 @@ const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

-const DB_NAME: &str = "storage_controller";
-
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -66,6 +65,10 @@ pub struct InspectResponse {

 impl StorageController {
    pub fn from_env(env: &LocalEnv) -> Self {
+        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
+            .unwrap()
+            .join("attachments.json");
+
        // Makes no sense to construct this if pageservers aren't going to use it: assume
        // pageservers have control plane API set
        let listen_url = env.control_plane_api.clone().unwrap();
@@ -125,6 +128,7 @@ impl StorageController {

        Self {
            env: env.clone(),
+            path,
            listen,
            private_key,
            public_key,
@@ -199,6 +203,7 @@ impl StorageController {
    ///
    /// Returns the database url
    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        const DB_NAME: &str = "storage_controller";
        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);

        let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -227,30 +232,6 @@ impl StorageController {
        Ok(database_url)
    }

-    pub async fn connect_to_database(
-        &self,
-    ) -> anyhow::Result<(
-        tokio_postgres::Client,
-        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
-    )> {
-        tokio_postgres::Config::new()
-            .host("localhost")
-            .port(self.postgres_port)
-            // The user is the ambient operating system user name.
-            // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
-            //
-            // Until we get there, use the ambient operating system user name.
-            // Recent tokio-postgres versions default to this if the user isn't specified.
-            // But tokio-postgres fork doesn't have this upstream commit:
-            // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
-            // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
-            .user(&whoami::username())
-            .dbname(DB_NAME)
-            .connect(tokio_postgres::NoTls)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
@@ -275,20 +256,17 @@ impl StorageController {
            if !status.success() {
                anyhow::bail!("initdb failed with status {status}");
            }
-        };

-        // Write a minimal config file:
-        // - Specify the port, since this is chosen dynamically
-        // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-        //   the storage controller we don't want a slow local disk to interfere with that.
-        //
-        // NB: it's important that we rewrite this file on each start command so we propagate changes
-        // from `LocalEnv`'s config file (`.neon/config`).
-        tokio::fs::write(
-            &pg_data_path.join("postgresql.conf"),
-            format!("port = {}\nfsync=off\n", self.postgres_port),
-        )
-        .await?;
+            // Write a minimal config file:
+            // - Specify the port, since this is chosen dynamically
+            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+            //   the storage controller we don't want a slow local disk to interfere with that.
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}\nfsync=off\n", self.postgres_port),
+            )
+            .await?;
+        };

        println!("Starting storage controller database...");
        let db_start_args = [
@@ -318,38 +296,11 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

-        // We support running a startup SQL script to fiddle with the database before we launch storcon.
-        // This is used by the test suite.
-        let startup_script_path = self
-            .env
-            .base_data_dir
-            .join("storage_controller_db.startup.sql");
-        let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
-            Ok(script) => {
-                tokio::fs::remove_file(startup_script_path).await?;
-                script
-            }
-            Err(e) => {
-                if e.kind() == std::io::ErrorKind::NotFound {
-                    // always run some startup script so that this code path doesn't bit rot
-                    "BEGIN; COMMIT;".to_string()
-                } else {
-                    anyhow::bail!("Failed to read startup script: {e}")
-                }
-            }
-        };
-        let (mut client, conn) = self.connect_to_database().await?;
-        let conn = tokio::spawn(conn);
-        let tx = client.build_transaction();
-        let tx = tx.start().await?;
-        tx.batch_execute(&startup_script).await?;
-        tx.commit().await?;
-        drop(client);
-        conn.await??;
-
        let mut args = vec![
            "-l",
            &self.listen,
+            "-p",
+            self.path.as_ref(),
            "--dev",
            "--database-url",
            &database_url,
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -17,7 +17,6 @@ pageserver_client.workspace = true
 reqwest.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
-storage_controller_client.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -14,15 +14,15 @@ use pageserver_api::{
    },
    shard::{ShardStripeSize, TenantShardId},
 };
-use pageserver_client::mgmt_api::{self};
+use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
 use reqwest::{Method, StatusCode, Url};
+use serde::{de::DeserializeOwned, Serialize};
 use utils::id::{NodeId, TenantId};

 use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
    TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
-use storage_controller_client::control_api::Client;

 #[derive(Subcommand, Debug)]
 enum Command {
@@ -56,10 +56,6 @@ enum Command {
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
-    NodeDelete {
-        #[arg(long)]
-        node_id: NodeId,
-    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
@@ -249,6 +245,64 @@ impl FromStr for NodeAvailabilityArg {
    }
 }

+struct Client {
+    base_url: Url,
+    jwt_token: Option<String>,
+    client: reqwest::Client,
+}
+
+impl Client {
+    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
+        Self {
+            base_url,
+            jwt_token,
+            client: reqwest::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
+        }
+    }
+
+    /// Simple HTTP request wrapper for calling into storage controller
+    async fn dispatch<RQ, RS>(
+        &self,
+        method: Method,
+        path: String,
+        body: Option<RQ>,
+    ) -> mgmt_api::Result<RS>
+    where
+        RQ: Serialize + Sized,
+        RS: DeserializeOwned + Sized,
+    {
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            self.base_url.host_str().unwrap(),
+            self.base_url.port().unwrap()
+        ))
+        .unwrap();
+
+        let mut builder = self.client.request(method, url);
+        if let Some(body) = body {
+            builder = builder.json(&body)
+        }
+        if let Some(jwt_token) = &self.jwt_token {
+            builder = builder.header(
+                reqwest::header::AUTHORIZATION,
+                format!("Bearer {jwt_token}"),
+            );
+        }
+
+        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
+        let response = response.error_from_body().await?;
+
+        response
+            .json()
+            .await
+            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
+    }
+}
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
@@ -283,7 +337,7 @@ async fn main() -> anyhow::Result<()> {
        }
        Command::TenantCreate { tenant_id } => {
            storcon_client
-                .dispatch::<_, ()>(
+                .dispatch(
                    Method::POST,
                    "v1/tenant".to_string(),
                    Some(TenantCreateRequest {
@@ -303,16 +357,13 @@ async fn main() -> anyhow::Result<()> {
            tracing::info!("Delete status: {}", status);
        }
        Command::Nodes {} => {
-            let mut resp = storcon_client
+            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
-
-            resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
-
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
@@ -344,16 +395,13 @@ async fn main() -> anyhow::Result<()> {
                .await?;
        }
        Command::Tenants {} => {
-            let mut resp = storcon_client
+            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
-
-            resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
-
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
@@ -602,11 +650,6 @@ async fn main() -> anyhow::Result<()> {
                .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
                .await?;
        }
-        Command::NodeDelete { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
-                .await?;
-        }
        Command::TenantSetTimeBasedEviction {
            tenant_id,
            period,
--- a/docs/rfcs/034-ancestor-deletion.md
+++ b/docs/rfcs/034-ancestor-deletion.md
@@ -1,252 +0,0 @@
-# Ancestor Timeline Deletion
-
-Created on: 2024-02-23
-
-Author: John Spray
-
-# Summary
-
-When a tenant creates a new timeline that they will treat as their 'main' history,
-it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
-this is necessary because it is forbidden to delete a timeline which has descendents.
-
-A new pageserver API is proposed to 'adopt' data from a parent timeline into
-one of its children, such that the link between ancestor and child can be severed,
-leaving the parent in a state where it may then be deleted.
-
-# Motivation
-
-Retaining parent timelines currently has two costs:
-
- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
-  child's timeline point, even if the child fully covers its keyspace with image
-  layers and will never actually read from the parent.
-
-# Solution
-
-A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
-will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
-wish to detach from its parent.
-
-On success, this API will leave the following state:
-
- The detached child timeline will no longer have an ancestor, and will contain all
-  the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
-  the detached child timeline will be modified to have the child timeline as their
-  new parent.
- The parent timeline will still exist, but the child will no longer have it as an
-  ancestor. If this was the last timeline that depended on the parent, then the
-  parent will become deletable.
-
-This API's implementation will consist of a series of retryable steps, such that
-on failures/timeout it can safely be called again to reach the target state.
-
-## Example
-
-### Before
-
-The user has "rolled back" their project to LSN X, resulting in a "new main"
-timeline. The parent "old main" timeline still exists, and they would like
-to clean it up.
-
-They have two other timelines A and B. A is from before the rollback point,
-and B is from after the rollback point.
-
-```
----"old main" timeline-------X-------------------------------------------->
-                |             |                         |
-                |-> child A   |                         |
-                              |-> "new main" timeline   |
-                                                        -> child B
-
-```
-
-### After calling detach ancestor API
-
-The "new main" timeline is no longer dependent on old main, and neither
-is child A, because it had a branch point before X.
-
-The user may now choose to delete child B and "old main" to get to
-a pristine state. Child B is likely to be unwanted since the user
-chose to roll back to X, and it branches from after X. However, we
-don't assume this in the API; it is up to the user to delete it.
-
-```
-|----"old main" timeline---------------------------------------------------->
-                                                         |
-                                                         |
-                                                         |
-                                                         -> child B
-
-|----"new main" timeline--------->
-                 |
-                 |-> child A
-
-
-```
-
-### After removing timelines
-
-We end up with a totally clean state that leaves no trace that a rollback
-ever happened: there is only one root timeline.
-
-```
-| ----"new main" timeline----------->
-                |
-                |-> child A
-
-
-```
-
-## Caveats
-
-Important things for API users to bear in mind:
-
- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
-  child, the parent won't be deletable: you must either delete or detach those
-  children.
- do _not_ simply loop over all children and detach them all: this can have an
-  extremely high storage cost. The detach ancestor API is intended for use on a single
-  timeline to make it the new "main".
- The detach ancestor API should also not be
-  exposed directly to the user as button/API, because they might decide
-  to click it for all the children and thereby generate many copies of the
-  parent's data -- the detach ancestor API should be used as part
-  of a high level "clean up after rollback" feature.
-
-## `detach_ancestor` API implementation
-
-Terms used in the following sections:
-
- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
-  called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point" the ancestor_lsn of "the child"
-
-### Phase 1: write out adopted layers to S3
-
-The child will "adopt" layers from the parent, such that its end state contains
-all the parent's history as well as its own.
-
-For all layers in the parent's layer map whose high LSN is below the branch
-point, issue S3 CopyObject requests to duplicate them into the child timeline's
-prefix. Do not add them to the child's layer map yet.
-
-For delta layers in the parent's layer map which straddle the branch point, read them
-and write out only content up to the branch point into new layer objects.
-
-This is a long running operation if the parent has many layers: it should be
-implemented in a way that resumes rather than restarting from scratch, if the API
-times out and is called again.
-
-As an optimization, if there are no other timelines that will be adopted into
-the child, _and_ the child's image layers already full cover the branch LSN,
-then we may skip adopting layers.
-
-### Phase 2: update the child's index
-
-Having written out all needed layers in phase 1, atomically link them all
-into the child's IndexPart and upload to S3. This may be done while the
-child Timeline is still running.
-
-### Phase 3: modify timelines ancestry
-
-Modify the child's ancestor to None, and upload its IndexPart to persist the change.
-
-For all timelines which have the same parent as the child, and have a branch
-point lower than our branch point, switch their ancestor_timeline to the child,
-and upload their IndexPart to persist the change.
-
-## Alternatives considered
-
-### Generate full image layer on child, rather than adopting parent deltas
-
-This would work for the case of a single child, but would prevent re-targeting
-other timelines that depended on the parent. If we detached many children this
-way, the storage cost would become prohibitive (consider a 1TB database with
-100 child timelines: it would cost 100TiB if they all generated their own image layers).
-
-### Don't rewrite anything: just fake it in the API
-
-We could add a layer of indirection that let a child "pretend" that it had no
-ancestor, when in reality it still had the parent. The pageserver API could
-accept deletion of ancestor timelines, and just update child metadata to make
-them look like they have no ancestor.
-
-This would not achieve the desired reduction in storage cost, and may well be more
-complex to maintain than simply implementing the API described in this RFC.
-
-### Avoid copying objects: enable child index to use parent layers directly
-
-We could teach IndexPart to store a TimelineId for each layer, such that a child
-timeline could reference a parent's layers directly, rather than copying them
-into the child's prefix.
-
-This would impose a cost for the normal case of indices that only target the
-timeline's own layers, add complexity, and break the useful simplifying
-invariant that timelines "own" their own path. If child timelines were
-referencing layers from the parent, we would have to ensure that the parent
-never runs GC/compaction again, which would make the API less flexible (the
-proposal in this RFC enables deletion of the parent but doesn't require it.)
-
-## Performance
-
-### Adopting layers
-
- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
-  of such requests: this can take up to tens of seconds and will compete for RemoteStorage
-  semaphore units with other activity on the pageserver.
- If we are running on storage backend that doesn't implement CopyObject, then
-  this part will be much more expensive as we would stream all layer content
-  through the pageserver. This is no different to issuing a lot
-  of reads to a timeline that does not have a warm local cache: it will move
-  a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for delta that straddle the branch point will
-  require streaming read/write of all the layers in question.
-
-### Updating timeline ancestry
-
-The simplest way to update timeline ancestry will probably be to stop and start
-all the Timeline objects: this is preferable to the complexity of making their
-ancestry mutable at runtime.
-
-There will be a corresponding "stutter" in the availability of the timelines,
-of the order 10-100ms, which is the time taken to upload their IndexPart, and
-restart the Timeline.
-
-# Interaction with other features
-
-## Concurrent timeline creation
-
-If new historic timelines are created using the parent as an ancestor while the
-detach ancestor API is running, they will not be re-parented to the child. This
-doesn't break anything, but it leaves the parent in a state where it might not
-be possible to delete it.
-
-Since timeline creations are an explicit user action, this is not something we need to
-worry about as the storage layer: a user who wants to delete their parent timeline will not create
-new children, and if they do, they can choose to delete those children to
-enable deleting the parent.
-
-For the least surprise to the user, before starting the detach ancestor branch
-operation, the control plane should wait until all branches are created and not
-allow any branches to be created before the branch point on the ancestor branch
-while the operation is ongoing.
-
-## WAL based disaster recovery
-
-WAL based disaster recovery currently supports only restoring of the main
-branch. Enabling WAL based disaster recovery in the future requires that we
-keep a record which timeline generated the WAL and at which LSN was a parent
-detached. Keep a list of timeline ids and the LSN in which they were detached in
-the `index_part.json`. Limit the size of the list to 100 first entries, after
-which the WAL disaster recovery will not be possible.
-
-## Sharded tenants
-
-For sharded tenants, calls to the detach ancestor API will pass through the storage
-controller, which will handle them the same as timeline creations: invoke first
-on shard zero, and then on all the other shards.
--- a/docs/rfcs/034-timeline-archive.md
+++ b/docs/rfcs/034-timeline-archive.md
@@ -1,507 +0,0 @@
-# Timeline Archival
-
-## Summary
-
-This RFC describes a mechanism for pageservers to eliminate local storage + compute work
-for timelines which are not in use, in response to external API calls to "archive" a timeline.
-
-The archived state roughly corresponds to fully offloading a timeline to object storage, such
-that its cost is purely the cost of that object storage.
-
-## Motivation
-
-Archived timelines serve multiple purposes:
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
-  database from longer ago than their PITR window.
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
-  to diligently clean them up later to avoid overloading the pageserver (currently we support
-  up to ~500 branches per tenant).
-
-### Prior art
-
-Most storage and database systems have some form of snapshot, which can be implemented several ways:
-1. full copies of data (e.g. an EBS snapshot to S3)
-2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
-3. a series of snapshots which are CoW or de-duplicated relative to one another.
-
-Today's Neon branches are approximately like `2.`, although due to implementation details branches
-often end up storing much more data than they really need, as parent branches assume that all data
-at the branch point is needed.  The layers pinned in the parent branch may have a much larger size
-than the physical size of a compressed image layer representing the data at the branch point.
-
-## Requirements
-
- Enter & exit the archived state in response to external admin API calls
- API calls to modify the archived state are atomic and durable
- An archived timeline should eventually (once out of PITR window) use an efficient compressed
-  representation, and avoid retaining arbitrarily large data in its parent branch.
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
-  but must not scale with the number of _archived_ branches.
- Background I/O for archived branches should only be done a limited number of times to evolve them
-  to a long-term-efficient state (e.g. rewriting to image layers).  There should be no ongoing "housekeeping"
-  overhead for archived branches, including operations related to calculating sizes for billing.
- The pageserver should put no load on the safekeeper for archived branches.
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
-  to a performant state in a short time (linear with the branch's logical size)
-
-## Non Goals
-
- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
-  in Neon's internal format.
- Compute cold starts after activating an archived branch will not have comparable performance to
-  cold starts on an active branch.
- Archived branches will not use any new/additional compression or de-duplication beyond what
-  is already implemented for image layers (zstd per page).
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
-  are only activated explicitly via the HTTP API.
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
-  remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
-  detailed HTTP APIs other than the specific API for listing archived timelines.
- A parent branch may not be archived unless all its children are.
-
-## Impacted Components
-
-pageserver, storage controller
-
-## Terminology
-
-**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
-may assume that this branch is now very cheap to store, although this may not be physically so until the
-branch proceeds to the offloaded state.
-
-**Active** branches are branches which are available for use by page_service clients, and have a relatively
-high cost due to consuming local storage.
-
-**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
-that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
-
-**Activate** (verb): transition from Archived to Active
-
-**Archive** (verb): transition from Active to Archived
-
-**Offload** (verb): transition from Archived to Offloaded
-
-**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
-
-**Warm up** (verb): operation done on an active branch, by downloading its active layers.  Once a branch is
-warmed up, good performance will be available to page_service clients.
-
-## Implementation
-
-### High level flow
-
-We may think of a timeline which is archived and then activated as proceeding through a series of states:
-
-```mermaid
-stateDiagram
-  [*] --> Active(warm)
-  Active(warm) --> Archived
-  Archived --> Offloaded
-  Archived --> Active(warm)
-  Offloaded --> Active(cold)
-  Active(cold) --> Active(warm)
-```
-
-Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
-of branches will be:
- Very frequent: Short lived branches: Active -> Deleted
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active
-
-These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
-of:
- the timeline's lifecycle state: active or archived, stored in the timeline's index
- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
-  manifest of offloaded timelines.
- cache state (whether it's warm or cold).
-
-### Storage format changes
-
-There are two storage format changes:
-1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
-   be considered active or archived.
-2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
-   at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
-
-The manifest object will have a format like this:
-```
-{
-  "offload_timelines": [
-    {
-      "timeline_id": ...
-      "last_record_lsn": ...
-      "last_record_lsn_time": ...
-      "pitr_interval": ...
-      "last_gc_lsn": ...  # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
-      "logical_size": ...  # The size at last_record_lsn
-      "physical_size" ...
-      "parent": Option<{
-        "timeline_id"...
-        "lsn"... # Branch point LSN on the parent
-        "requires_data": bool # True if this branch depends on layers in its parent, identify it here
-
-      }>
-    }
-  ]
-}
-```
-
-The information about a timeline in its offload state is intentionally minimal: just enough to decide:
- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
-  by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn.
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
-  layers that the archived branch depends on
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
-  is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then
-  we don't need to go to S3 for the deletion.
- How much archived space to report in consumption metrics
-
-The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
-set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
-(offloaded timelines).
-
-For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
-index_part objects are (see [generation numbers RFC](025-generation-numbers.md)).  This will add some complexity, but
-give us total safety against two pageservers with the same tenant attached fighting over the object.  Existing code
-for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
-the manifest file.
-
-### API & Timeline state
-
-Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart.  This will
-be controlled by a new per-timeline `configure` endpoint.  This is intentionally generic naming, which
-may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
-a per-timeline configuration).
-
-`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
-```
-{
-  'state': 'active|archive'
-}
-```
-
-When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
-
-When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
-**and** the `Timeline` object has been instantiated and activated.  This will require reading the timeline's
-index, but not any data: it should be about as fast as a couple of small S3 requests.
-
-The API will be available with identical path via the storage controller: calling this on a sharded tenant
-will simply map the API call to all the shards.
-
-Archived timelines may never have descendent timelines which are active.  This will be enforced at the API level,
-such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
-that all its descendents are archived.  It is the callers responsibility to walk the hierarchy of timelines
-in the proper order if they would like to archive whole trees of branches.
-
-Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically
-for archived timelines will be added: this is for use in support/debug:
-
-```
-GET /v1/tenants/{tenant_id}/archived_timelines
-
-{
-  ...same per-timeline content as the tenant manifest...
-}
-
-```
-
-### Tenant attach changes
-
-Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
-we load their index_part.json.  To avoid the number of GETs scaling linearly with the number of archived
-timelines, we must have a single object that tells us which timelines do not need to be loaded.  The
-number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
-because each request covers 1000 timelines.
-
-This is **not** literally the same as the set of timelines who have state=archived.  Rather, it is
-the set of timelines which have been offloaded in the background after their state was set to archived.
-
-We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
-exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need
-to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
-to delete an offloaded timeline.
-
-### Warm-up API
-
-`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
-
-This API will be similar to the existing `download_remote_layers` API, but smarter:
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
-  of downloads, so that the caller can poll.
-
-The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
-of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
-can possibly be read from these LSNs.  This concept of layer visibility is more generally useful for cache
-eviction and heatmaps, as well as in this specific case of warming up a timeline.
-
-The caller does not have to wait for the warm up API, or call it at all.  But it is strongly advised
-to call it, because otherwise populating local contents for a timeline can take a long time when waiting
-for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
-volatile.
-
-### Background work
-
-Archived branches are not subject to normal compaction.  Instead, when the compaction loop encounters
-an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
-([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
-if its state permits that.
-
-Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
-optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
-has elapsed and it can now be rewritten to image layers.
-
-#### Archive branch offload
-
-Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
-any actual work.
-
-This work is done in the background compaction loop.  It makes sense to tag this work on to the compaction
-loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
-
-The condition for offload is simple:
- - a `Timeline` object exists with state `Archived`
- - the timeline does not have any non-offloaded children.
- 
- Regarding the condition that children must be offloaded, this will always be eventually true, because
- we enforce at the API level that children of archived timelines must themselves be archived, and all
- archived timelines will eventually be offloaded.
-
-Offloading a timeline is simple:
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
-
-#### Archive branch optimization (flattening)
-
-When we offloaded a branch, it might have had some history that prevented rewriting it to a single
-point in time set of image layers.  For example, a branch might have several days of writes and a 7
-day PITR: when we archive it, it still has those days of history.
-
-Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
-  a point in time compared with delta layers
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
-  for data, i.e. the ancestor is free to GC layers files at+below the branch point
-
-Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
-branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
-a true snapshot at that LSN.
-
-It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
-is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
-
-Archive branch optimization should be done _before_ background offloads during compaction, because there may
-be timelines which are ready to be offloaded but also would benefit from the optimization step before
-being offloaded.  For example, a branch which has already fallen out of PITR window and has no history
-of its own may be immediately re-written as a series of image layers before being offloaded.
-
-### Consumption metrics
-
-Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipating
-that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
-vs. ordinary content.
-
-Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
-variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
-
-### Secondary locations
-
-Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby
-when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
-will be dropped from secondary locations.
-
-### Sharding
-
-Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
-the same way that timeline creation and deletion is done.  There are no special rules about ordering:
-the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
-
-Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
-will be authoritative for consumption metrics.
-
-## Error cases
-
-### Errors in sharded tenants
-
-If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
-state, where a timeline is archived on some shards but not on others.  
-
-We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
-are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
-In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
-up their mistake.  We rely on this good behavior of callers to eventually get timelines into a consistent
-state across all shards.  If callers do leave a timeline in an inconsistent state across shards, this doesn't
-break anything, it's just "weird".
-
-This is similar to the status quo for timeline creation and deletion: callers are expected to retry
-these operations until they succeed.
-
-### Archiving/activating
-
-Archiving/activating a timeline can fail in a limited number of ways:
-1. I/O error storing/reading the timeline's updated index
-    - These errors are always retryable: a fundamental design assumption of the pageserver is that remote
-      storage errors are always transient. 
-2. NotFound if the timeline doesn't exist
-    - Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
-    - The storage controller has runtime locking to prevent races such as deleting a timeline while
-      archiving it.
-3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated
-    - Callers are expected to do their own checks to avoid hitting this case.  If they make
-      a mistake and encounter this error, they should give up.
-
-### Offloading
-
-Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
-tenant manifest.  In such error cases, we give up in the expectation that offloading will be tried 
-again at the next iteration of the compaction loop.
-
-### Archive branch optimization
-
-Optimization is a special form of compaction, so can encounter all the same errors as regular compaction
-can: it should return Result<(), CompactionError>, and as with compaction it will be retried on
-the next iteration of the compaction loop.
-
-## Optimizations
-
-### Delaying storage optimization if retaining parent layers is cheaper
-
-Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
-is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
-are offloaded to S3 they're totally safe, inert things.
-
-However, in some cases it can be advantageous to retain extra history on their parent branch rather
-than flattening the archived branch.  For example, if a 1TB parent branch is rather slow-changing (1GB
-of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
-for each nightly branch is inefficient compared with just keeping more history on the main branch.
-
-Getting this right requires consideration of:
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
-  write out extra image layers, then it might make more sense to just write out the image layers on
-  the archived branch.
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
-  the layer map (and index_part) bigger.  There are practical limits beyond which writing an indefinitely
-  large layer map can cause problems elsewhere.
-
-This optimization can probably be implemented quite cheaply with some basic heuristics like:
- don't bother doing optimization on an archive branch if the LSN distance between
-  its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
- ...but, Don't keep more history on the main branch than double the PITR
-
-### Creating a timeline in archived state (a snapshot)
-
-Sometimes, one might want to create a branch with no history, which will not be written to
-before it is archived.  This is a snapshot, although we do not require a special snapshot API,
-since a snapshot can be represented as a timeline with no history.
-
-This can be accomplished by simply creating a timeline and then immediately archiving it, but
-that is somewhat wasteful: this timeline it will spin up various tasks and open a connection to the storage
-broker to try and ingest WAL, before being shutdown in the subsequent archival call.  To explicitly
-support this common special case, we may add a parameter to the timeline creation API which
-creates a timeline directly into the archived state.
-
-Such a timeline creation will do exactly two I/Os at creation time:
- write the index_part object to record the timeline's existence
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
-  write the tenant manifest.
-
-Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
-up the 'snapshot' branch and write out image layers.
-
-## Future Work
-
-### Enabling `fullbackup` dumps from archive branches
-
-It would be useful to be able to export an archive branch to another system, or for use in a local
-postgres database.
-
-This could be implemented as a general capability for all branches, in which case it would "just work"
-for archive branches by activating them.  However, downloading all the layers in a branch just to generate
-a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
-which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
-
-Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
-is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup 
-stream to S3 in an intermediate format and, then having one node stitch them together).
-
-### Tagging layers from archived branches
-
-When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
-we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
-cheaper storage.
-
-This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
-external hints on which branches are likely to be reactivated, and which branches are good candidates for
-tagging for low performance storage.
-
-Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes.  Other clouds' object
-stores have similar mechanisms.
-
-### Storing sequences of archive branches as deltas
-
-When archived branches are used as scheduled snapshots, we could store them even more efficiently
-by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
-storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
-pages). This is the kind of encoding that many backup storage systems use.
-
-The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
-vs. just writing out a simple stream of the entire database.  For smaller databases, writing out a full
-copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
-so the complexity tradeoff of diff-encoding it is dubious).
-
-One does not necessarily have to read-back the previous snapshot in order to encoded the next one: if the
-pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
-we can say: "A branch exists from Monday night.  I have Monday night's data still active in the main branch,
-so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
-delta snapshot".
-
-Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
-each other: perhaps this would be done by making the archive branches have child/parent relationships with
-each other, or perhaps we would permit them to remain children of their original parent, but additionally
-have a relationship with the snapshot they're encoded relative to.
-
-Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
-out how frequently to write a full copy is important.  This is essentially a zoomed-out version of what
-we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
-
-
-## FAQ/Alternatives
-
-### Store all timelines in the tenant manifest
-
-Rather than special-casing offloaded timelines in the offload manifest, we could store a total
-manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
-startup.
-
-That would be a more invasive change (require hooking in to timeline creation), and would
-generate much more I/O to this manifest for tenants that had many branches _and_ frequent
-create/delete cycles for short lived branches.  Restricting the manifest to offloaded timelines
-means that we only have to cope with the rate at which long-lived timelines are archived, rather
-than the rate at which sort lived timelines are created & destroyed.
-
-### Automatically archiving/activating timelines without external API calls
-
-We could implement TTL driven offload of timelines, waking them up when a page request
-arrives.
-
-This has downsides:
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
-  know which of their branches are in this state, and might get a surprise when they try
-  to use such a branch.
- Price fluctuation: if the archival of a branch is used in end user pricing, then users
-  prefer clarity & consistency.  Ideally a branch's storage should cost the same from the moment it
-  is created, rather than having a usage-dependency storage price.
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
-  would be awkward, compared with an external entry point.
-
-### Make offloaded a state of Timeline
-
-To reduce the operator-facing complexity of having some timelines APIs that only return
-non-offloaded timelines, we could build the offloaded state into the Timeline type.
-
-`timeline.rs` is already one of the most egregiously long source files in the tree, so
-this is rejected on the basis that we need to avoid making that complexity worse.
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -44,7 +44,7 @@ If you need to modify the database schema, here’s how to create a migration:
 - Use `diesel migration generate <name>` to create a new migration
 - Populate the SQL files in the `migrations/` subdirectory
 - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
-  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
+  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
 - Commit the migration files and the changes to schema.rs
 - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
 - The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -13,7 +13,11 @@ use std::{

 use measured::{
    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
-    metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
+    metric::{
+        group::{Encoding, MetricValue},
+        name::MetricNameEncoder,
+        Metric, MetricType, MetricVec,
+    },
    text::TextEncoder,
    LabelGroup,
 };
@@ -140,7 +144,6 @@ impl<const N: usize> HyperLogLogState<N> {
        })
    }
 }
-
 impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
    for HyperLogLogState<N>
 {
@@ -179,13 +182,12 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
            .into_iter()
            .enumerate()
            .try_for_each(|(hll_shard, val)| {
-                CounterState::new(val as u64).collect_into(
-                    &(),
+                enc.write_metric_value(
+                    name.by_ref(),
                    labels.by_ref().compose_with(HllShardLabel {
                        hll_shard: hll_shard as i64,
                    }),
-                    name.by_ref(),
-                    enc,
+                    MetricValue::Int(val as i64),
                )
            })
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -9,7 +9,7 @@ use measured::{
    metric::{
        counter::CounterState,
        gauge::GaugeState,
-        group::Encoding,
+        group::{Encoding, MetricValue},
        name::{MetricName, MetricNameEncoder},
        MetricEncoding, MetricFamilyEncoding,
    },
@@ -171,11 +171,8 @@ fn write_gauge<Enc: Encoding>(
    labels: impl LabelGroup,
    name: impl MetricNameEncoder,
    enc: &mut Enc,
-) -> Result<(), Enc::Err>
-where
-    GaugeState: MetricEncoding<Enc>,
-{
-    GaugeState::new(x).collect_into(&(), labels, name, enc)
+) -> Result<(), Enc::Err> {
+    enc.write_metric_value(name, labels, MetricValue::Int(x))
 }

 #[derive(Default)]
@@ -547,6 +544,15 @@ impl<T: Encoding> Encoding for Inc<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
 }

 impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
@@ -573,6 +579,15 @@ impl<T: Encoding> Encoding for Dec<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
 }

 /// Write the dec counter to the encoder
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -87,7 +87,7 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }

-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
    pub tenant_id: TenantId,
    pub shards: Vec<TenantDescribeResponseShard>,
@@ -110,7 +110,7 @@ pub struct NodeDescribeResponse {
    pub listen_pg_port: u16,
 }

-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,

--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -12,7 +12,7 @@ use crate::reltag::{BlockNumber, RelTag, SlruKind};
 ///
 /// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
 /// for what we actually store in these fields.
-#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq, Ord, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
 pub struct Key {
    pub field1: u8,
    pub field2: u32,
@@ -22,41 +22,6 @@ pub struct Key {
    pub field6: u32,
 }

-impl PartialOrd for Key {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        if self.field1 == other.field1
-            && self.field2 == other.field2
-            && self.field3 == other.field3
-            && self.field4 == other.field4
-            && self.field5 == other.field5
-        {
-            self.field6.partial_cmp(&other.field6)
-        } else {
-            match self.field1.partial_cmp(&other.field1) {
-                Some(core::cmp::Ordering::Equal) => {}
-                ord => return ord,
-            }
-            match self.field2.partial_cmp(&other.field2) {
-                Some(core::cmp::Ordering::Equal) => {}
-                ord => return ord,
-            }
-            match self.field3.partial_cmp(&other.field3) {
-                Some(core::cmp::Ordering::Equal) => {}
-                ord => return ord,
-            }
-            match self.field4.partial_cmp(&other.field4) {
-                Some(core::cmp::Ordering::Equal) => {}
-                ord => return ord,
-            }
-            match self.field5.partial_cmp(&other.field5) {
-                Some(core::cmp::Ordering::Equal) => {}
-                ord => return ord,
-            }
-            self.field6.partial_cmp(&other.field6)
-        }
-    }
-}
-
 /// The storage key size.
 pub const KEY_SIZE: usize = 18;

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -294,6 +294,7 @@ pub struct TenantConfig {
    pub walreceiver_connect_timeout: Option<String>,
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
+    pub trace_read_requests: Option<bool>,
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
@@ -439,6 +440,9 @@ pub enum CompactionAlgorithm {

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub enum ImageCompressionAlgorithm {
+    /// Disabled for writes, and never decompress during reading.
+    /// Never set this after you've enabled compression once!
+    DisabledNoDecompress,
    // Disabled for writes, support decompressing during read path
    Disabled,
    /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
@@ -448,6 +452,12 @@ pub enum ImageCompressionAlgorithm {
    },
 }

+impl ImageCompressionAlgorithm {
+    pub fn allow_decompression(&self) -> bool {
+        !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress)
+    }
+}
+
 impl FromStr for ImageCompressionAlgorithm {
    type Err = anyhow::Error;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -456,6 +466,7 @@ impl FromStr for ImageCompressionAlgorithm {
            .next()
            .ok_or_else(|| anyhow::anyhow!("empty string"))?;
        match first {
+            "disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress),
            "disabled" => Ok(ImageCompressionAlgorithm::Disabled),
            "zstd" => {
                let level = if let Some(v) = components.next() {
@@ -651,17 +662,6 @@ pub struct TenantDetails {
    pub timelines: Vec<TimelineId>,
 }

-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
-pub enum TimelineArchivalState {
-    Archived,
-    Unarchived,
-}
-
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
-pub struct TimelineArchivalConfigRequest {
-    pub state: TimelineArchivalState,
-}
-
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
@@ -1683,6 +1683,10 @@ mod tests {
            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
            Disabled
        );
+        assert_eq!(
+            ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(),
+            DisabledNoDecompress
+        );
        assert_eq!(
            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
            Zstd { level: None }
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +1,6 @@
 use utils::id::TimelineId;

-#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
+#[derive(Default, serde::Serialize)]
 pub struct AncestorDetached {
    pub reparented_timelines: Vec<TimelineId>,
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,42 +1,59 @@
-//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
-//!
-//! This module contains a variety of types used to represent the concept of sharding
-//! a Neon tenant across multiple physical shards.  Since there are quite a few of these,
-//! we provide an summary here.
-//!
-//! Types used to describe shards:
-//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
-//!   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
-//!   a shard suffix.
-//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
-//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
-//!   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
-//!   tenant, such as layer files.
-//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
-//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
-//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
-//!   four hex digits.  An unsharded tenant is `0000`.
-//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
-//!
-//! Types used to describe the parameters for data distribution in a sharded tenant:
-//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
-//!   multiple shards.  Its value is given in 8kiB pages.
-//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
-//!   always zero: this is provided for future upgrades that might introduce different
-//!   data distribution schemes.
-//!
-//! Examples:
-//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
-//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
-//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
-//!   and their slugs are 0004, 0104, 0204, and 0304.
+use std::{ops::RangeInclusive, str::FromStr};

 use crate::{key::Key, models::ShardParameters};
+use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
+use utils::id::TenantId;

-#[doc(inline)]
-pub use ::utils::shard::*;
+/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
+///
+/// This module contains a variety of types used to represent the concept of sharding
+/// a Neon tenant across multiple physical shards.  Since there are quite a few of these,
+/// we provide an summary here.
+///
+/// Types used to describe shards:
+/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+///   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
+///   a shard suffix.
+/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
+///   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
+///   tenant, such as layer files.
+/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
+///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+///   four hex digits.  An unsharded tenant is `0000`.
+/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
+///
+/// Types used to describe the parameters for data distribution in a sharded tenant:
+/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+///   multiple shards.  Its value is given in 8kiB pages.
+/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+///   always zero: this is provided for future upgrades that might introduce different
+///   data distribution schemes.
+///
+/// Examples:
+/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
+/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
+/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
+///   and their slugs are 0004, 0104, 0204, and 0304.
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardNumber(pub u8);
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardCount(u8);
+
+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}

 /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
 /// and to check whether that [`ShardNumber`] is the same as the current shard.
@@ -48,6 +65,362 @@ pub struct ShardIdentity {
    layout: ShardLayout,
 }

+/// Formatting helper, for generating the `shard_id` label in traces.
+struct ShardSlug<'a>(&'a TenantShardId);
+
+/// TenantShardId globally identifies a particular shard in a particular tenant.
+///
+/// These are written as `<TenantId>-<ShardSlug>`, for example:
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
+/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
+/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
+///
+/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible with TenantId: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl ShardCount {
+    pub const MAX: Self = Self(u8::MAX);
+
+    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
+    /// legacy format for TenantShardId that excludes the shard suffix", also known
+    /// as [`TenantShardId::unsharded`].
+    ///
+    /// This method returns the actual number of shards, i.e. if our internal value is
+    /// zero, we return 1 (unsharded tenants have 1 shard).
+    pub fn count(&self) -> u8 {
+        if self.0 > 0 {
+            self.0
+        } else {
+            1
+        }
+    }
+
+    /// The literal internal value: this is **not** the number of shards in the
+    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
+    /// [`Self::count`] if you want to know the cardinality of shards.
+    pub fn literal(&self) -> u8 {
+        self.0
+    }
+
+    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
+    /// uses the legacy format for `TenantShardId`. See also the documentation for
+    /// [`Self::count`].
+    pub fn is_unsharded(&self) -> bool {
+        self.0 == 0
+    }
+
+    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
+    /// [`Self::literal`] would return.
+    pub const fn new(val: u8) -> Self {
+        Self(val)
+    }
+}
+
+impl ShardNumber {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl TenantShardId {
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        RangeInclusive::new(
+            Self {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            },
+            Self {
+                tenant_id,
+                shard_number: ShardNumber::MAX,
+                shard_count: ShardCount::MAX,
+            },
+        )
+    }
+
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+
+    /// Convenience for code that has special behavior on the 0th shard.
+    pub fn is_shard_zero(&self) -> bool {
+        self.shard_number == ShardNumber(0)
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
+    }
+
+    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
+    /// is useful when logging from code that is already in a span that includes tenant ID, to
+    /// keep messages reasonably terse.
+    pub fn to_index(&self) -> ShardIndex {
+        ShardIndex {
+            shard_number: self.shard_number,
+            shard_count: self.shard_count,
+        }
+    }
+
+    /// Calculate the children of this TenantShardId when splitting the overall tenant into
+    /// the given number of shards.
+    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
+        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
+        let mut child_shards = Vec::new();
+        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
+            // Key mapping is based on a round robin mapping of key hash modulo shard count,
+            // so our child shards are the ones which the same keys would map to.
+            if shard_number % effective_old_shard_count == self.shard_number.0 {
+                child_shards.push(TenantShardId {
+                    tenant_id: self.tenant_id,
+                    shard_number: ShardNumber(shard_number),
+                    shard_count: new_shard_count,
+                })
+            }
+        }
+
+        child_shards
+    }
+}
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count != ShardCount(0) {
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+        } else {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            self.tenant_id.fmt(f)
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(b[16]),
+            shard_count: ShardCount(b[17]),
+        }
+    }
+}
+
+impl ShardIndex {
+    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            shard_number: number,
+            shard_count: count,
+        }
+    }
+    pub fn unsharded() -> Self {
+        Self {
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
+
+    /// For use in constructing remote storage paths: concatenate this with a TenantId
+    /// to get a fully qualified TenantShardId.
+    ///
+    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
+    /// that the legacy pre-sharding remote key format is preserved.
+    pub fn get_suffix(&self) -> String {
+        if self.is_unsharded() {
+            "".to_string()
+        } else {
+            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+        }
+    }
+}
+
+impl std::fmt::Display for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    }
+}
+
+impl std::fmt::Debug for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for ShardIndex {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 1 byte shard number, 1 byte shard count
+        if s.len() == 4 {
+            let bytes = s.as_bytes();
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(bytes, &mut shard_parts)?;
+            Ok(Self {
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 2]> for ShardIndex {
+    fn from(b: [u8; 2]) -> Self {
+        Self {
+            shard_number: ShardNumber(b[0]),
+            shard_count: ShardCount(b[1]),
+        }
+    }
+}
+
+impl Serialize for TenantShardId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Note: while human encoding of [`TenantShardId`] is backward and forward
+            // compatible, this binary encoding is not.
+            let mut packed: [u8; 18] = [0; 18];
+            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
+            packed[16] = self.shard_number.0;
+            packed[17] = self.shard_count.0;
+
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 18])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 18] = Deserialize::deserialize(s)?;
+                Ok(TenantShardId::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                18,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
@@ -212,6 +585,77 @@ impl ShardIdentity {
    }
 }

+impl Serialize for ShardIndex {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Binary encoding is not used in index_part.json, but is included in anticipation of
+            // switching various structures (e.g. inter-process communication, remote metadata) to more
+            // compact binary encodings in future.
+            let mut packed: [u8; 2] = [0; 2];
+            packed[0] = self.shard_number.0;
+            packed[1] = self.shard_count.0;
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for ShardIndex {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = ShardIndex;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 2])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 2] = Deserialize::deserialize(s)?;
+                Ok(ShardIndex::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                ShardIndex::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                2,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
 /// in order to be able to serve basebackup requests without peer communication).
 fn key_is_shard0(key: &Key) -> bool {
@@ -293,9 +737,7 @@ pub fn describe(

 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
-    use utils::{id::TenantId, Hex};
+    use utils::Hex;

    use super::*;

--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -13,7 +13,6 @@ rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-util.workspace = true
 tokio-rustls.workspace = true
 tracing.workspace = true

@@ -24,4 +23,4 @@ workspace_hack.workspace = true
 once_cell.workspace = true
 rustls-pemfile.workspace = true
 tokio-postgres.workspace = true
-tokio-postgres-rustls.workspace = true
+tokio-postgres-rustls.workspace = true
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -16,7 +16,6 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -401,15 +400,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
    }

    /// Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run(
+    pub async fn run<F, S>(
        mut self,
        handler: &mut impl Handler<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError> {
-        let ret = self.run_message_loop(handler, cancel).await;
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S + Clone,
+        S: Future,
+    {
+        let ret = self
+            .run_message_loop(handler, shutdown_watcher.clone())
+            .await;

        tokio::select! {
-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // do nothing; we most likely got already stopped by shutdown and will log it next.
            }
            _ = self.framed.shutdown() => {
@@ -439,17 +444,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        }
    }

-    async fn run_message_loop(
+    async fn run_message_loop<F, S>(
        &mut self,
        handler: &mut impl Handler<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError> {
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
        trace!("postgres backend to {:?} started", self.peer_addr);

        tokio::select!(
            biased;

-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
                return Err(QueryError::Shutdown)
@@ -464,7 +473,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        let mut query_string = Bytes::new();
        while let Some(msg) = tokio::select!(
            biased;
-            _ = cancel.cancelled() => {
+            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
                return Err(QueryError::Shutdown)
@@ -476,7 +485,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            let result = self.process_message(handler, msg, &mut query_string).await;
            tokio::select!(
                biased;
-                _ = cancel.cancelled() => {
+                _ = shutdown_watcher() => {
                    // We were requested to shut down.
                    tracing::info!("shutdown request received during response flush");

@@ -663,17 +672,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        assert!(self.state < ProtoState::Authentication);
        let have_tls = self.tls_config.is_some();
        match msg {
-            FeStartupPacket::SslRequest { direct } => {
+            FeStartupPacket::SslRequest => {
                debug!("SSL requested");

-                if !direct {
-                    self.write_message(&BeMessage::EncryptionResponse(have_tls))
-                        .await?;
-                } else if !have_tls {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "direct SSL negotiation but no TLS support"
-                    )));
-                }
+                self.write_message(&BeMessage::EncryptionResponse(have_tls))
+                    .await?;

                if have_tls {
                    self.start_tls().await?;
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -3,14 +3,13 @@ use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
 use std::io::Cursor;
-use std::sync::Arc;
+use std::{future, sync::Arc};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::{TcpListener, TcpStream};
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
 use tokio_postgres_rustls::MakeRustlsConnect;
-use tokio_util::sync::CancellationToken;

 // generate client, server test streams
 async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -51,7 +50,7 @@ async fn simple_select() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, &CancellationToken::new()).await
+        pgbackend.run(&mut handler, future::pending::<()>).await
    });

    let conf = Config::new();
@@ -103,7 +102,7 @@ async fn simple_select_ssl() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, &CancellationToken::new()).await
+        pgbackend.run(&mut handler, future::pending::<()>).await
    });

    let client_cfg = rustls::ClientConfig::builder()
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -132,7 +132,7 @@ pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
 pub const XLOG_BLCKSZ: usize = 8192;
 pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;

-pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 128;
+pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -44,9 +44,9 @@ impl ConnectionError {
 /// Wraps async io `stream`, providing messages to write/flush + read Postgres
 /// messages.
 pub struct Framed<S> {
-    pub stream: S,
-    pub read_buf: BytesMut,
-    pub write_buf: BytesMut,
+    stream: S,
+    read_buf: BytesMut,
+    write_buf: BytesMut,
 }

 impl<S> Framed<S> {
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -39,39 +39,14 @@ pub enum FeMessage {
    PasswordMessage(Bytes),
 }

-#[derive(Clone, Copy, PartialEq, PartialOrd)]
-pub struct ProtocolVersion(u32);
-
-impl ProtocolVersion {
-    pub const fn new(major: u16, minor: u16) -> Self {
-        Self((major as u32) << 16 | minor as u32)
-    }
-    pub const fn minor(self) -> u16 {
-        self.0 as u16
-    }
-    pub const fn major(self) -> u16 {
-        (self.0 >> 16) as u16
-    }
-}
-
-impl fmt::Debug for ProtocolVersion {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_list()
-            .entry(&self.major())
-            .entry(&self.minor())
-            .finish()
-    }
-}
-
 #[derive(Debug)]
 pub enum FeStartupPacket {
    CancelRequest(CancelKeyData),
-    SslRequest {
-        direct: bool,
-    },
+    SslRequest,
    GssEncRequest,
    StartupMessage {
-        version: ProtocolVersion,
+        major_version: u32,
+        minor_version: u32,
        params: StartupMessageParams,
    },
 }
@@ -326,23 +301,11 @@ impl FeStartupPacket {
    /// different from [`FeMessage::parse`] because startup messages don't have
    /// message type byte; otherwise, its comments apply.
    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
        const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
-        const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
-        const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
-        const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
-        /// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
-        const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
-
-        // <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
-        // First byte indicates standard SSL handshake message
-        // (It can't be a Postgres startup length because in network byte order
-        // that would be a startup packet hundreds of megabytes long)
-        if buf.first() == Some(&0x16) {
-            return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
-        }
+        const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
+        const CANCEL_REQUEST_CODE: u32 = 5678;
+        const NEGOTIATE_SSL_CODE: u32 = 5679;
+        const NEGOTIATE_GSS_CODE: u32 = 5680;

        // need at least 4 bytes with packet len
        if buf.len() < 4 {
@@ -375,10 +338,12 @@ impl FeStartupPacket {
        let mut msg = buf.split_to(len).freeze();
        msg.advance(4); // consume len

-        let request_code = ProtocolVersion(msg.get_u32());
+        let request_code = msg.get_u32();
+        let req_hi = request_code >> 16;
+        let req_lo = request_code & ((1 << 16) - 1);
        // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
-        let message = match request_code {
-            CANCEL_REQUEST_CODE => {
+        let message = match (req_hi, req_lo) {
+            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
                if msg.remaining() != 8 {
                    return Err(ProtocolError::BadMessage(
                        "CancelRequest message is malformed, backend PID / secret key missing"
@@ -390,22 +355,21 @@ impl FeStartupPacket {
                    cancel_key: msg.get_i32(),
                })
            }
-            NEGOTIATE_SSL_CODE => {
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
                // Requested upgrade to SSL (aka TLS)
-                FeStartupPacket::SslRequest { direct: false }
+                FeStartupPacket::SslRequest
            }
-            NEGOTIATE_GSS_CODE => {
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
                // Requested upgrade to GSSAPI
                FeStartupPacket::GssEncRequest
            }
-            version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
+            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
                return Err(ProtocolError::Protocol(format!(
-                    "Unrecognized request code {}",
-                    version.minor()
+                    "Unrecognized request code {unrecognized_code}"
                )));
            }
            // TODO bail if protocol major_version is not 3?
-            version => {
+            (major_version, minor_version) => {
                // StartupMessage

                let s = str::from_utf8(&msg).map_err(|_e| {
@@ -418,7 +382,8 @@ impl FeStartupPacket {
                })?;

                FeStartupPacket::StartupMessage {
-                    version,
+                    major_version,
+                    minor_version,
                    params: StartupMessageParams {
                        params: msg.slice_ref(s.as_bytes()),
                    },
@@ -557,10 +522,6 @@ pub enum BeMessage<'a> {
    RowDescription(&'a [RowDescriptor<'a>]),
    XLogData(XLogDataBody<'a>),
    NoticeResponse(&'a str),
-    NegotiateProtocolVersion {
-        version: ProtocolVersion,
-        options: &'a [&'a str],
-    },
    KeepAlive(WalSndKeepAlive),
 }

@@ -984,18 +945,6 @@ impl<'a> BeMessage<'a> {
                    buf.put_u8(u8::from(req.request_reply));
                });
            }
-
-            BeMessage::NegotiateProtocolVersion { version, options } => {
-                buf.put_u8(b'v');
-                write_body(buf, |buf| {
-                    buf.put_u32(version.0);
-                    buf.put_u32(options.len() as u32);
-                    for option in options.iter() {
-                        write_cstr(option, buf)?;
-                    }
-                    Ok(())
-                })?
-            }
        }
        Ok(())
    }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -443,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
 }

 impl GenericRemoteStorage {
-    pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
+    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        let timeout = storage_config.timeout;
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs { local_path: path } => {
@@ -458,7 +458,7 @@ impl GenericRemoteStorage {
                    std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
            }
            RemoteStorageKind::AzureContainer(azure_config) => {
                let storage_account = azure_config
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -16,10 +16,16 @@ use std::{

 use anyhow::{anyhow, Context as _};
 use aws_config::{
-    default_provider::credentials::DefaultCredentialsChain,
+    environment::credentials::EnvironmentVariableCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    profile::ProfileFileCredentialsProvider,
+    provider_config::ProviderConfig,
    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
    BehaviorVersion,
 };
+use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
@@ -70,27 +76,40 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
+    pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            remote_storage_config.bucket_name
        );

-        let region = Region::new(remote_storage_config.bucket_region.clone());
-        let region_opt = Some(region.clone());
+        let region = Some(Region::new(remote_storage_config.bucket_region.clone()));

-        // https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
-        // https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
-        // Incomplete list of auth methods used by this:
-        // * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
-        // * "AWS_PROFILE" / `aws sso login --profile <profile>`
-        // * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
-        // * http (ECS/EKS) container credentials
-        // * imds v2
-        let credentials_provider = DefaultCredentialsChain::builder()
-            .region(region)
-            .build()
-            .await;
+        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
+
+        let credentials_provider = {
+            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+            CredentialsProviderChain::first_try(
+                "env",
+                EnvironmentVariableCredentialsProvider::new(),
+            )
+            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
+            .or_else(
+                "profile-sso",
+                ProfileFileCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else(
+                "token",
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses imds v2
+            .or_else("imds", ImdsCredentialsProvider::builder().build())
+        };

        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
@@ -99,9 +118,9 @@ impl S3Bucket {
            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
            BehaviorVersion::v2023_11_09(),
        )
-        .region(region_opt)
+        .region(region)
        .identity_cache(IdentityCache::lazy().build())
-        .credentials_provider(credentials_provider)
+        .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
        .sleep_impl(SharedAsyncSleep::from(sleep_impl));

        let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
@@ -1022,8 +1041,8 @@ mod tests {

    use crate::{RemotePath, S3Bucket, S3Config};

-    #[tokio::test]
-    async fn relative_path() {
+    #[test]
+    fn relative_path() {
        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
@@ -1066,9 +1085,8 @@ mod tests {
                max_keys_per_list_response: Some(5),
                upload_storage_class: None,
            };
-            let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
-                .await
-                .expect("remote storage init");
+            let storage =
+                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
                let result = storage.relative_path_to_s3_object(test_path);
                let expected = expected_outputs[prefix_idx][test_path_idx];
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -31,7 +31,6 @@ struct EnabledAzure {
 impl EnabledAzure {
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let client = create_azure_client(max_keys_in_list_response)
-            .await
            .context("Azure client creation")
            .expect("Azure client creation failed");

@@ -188,7 +187,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    }
 }

-async fn create_azure_client(
+fn create_azure_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    use rand::Rng;
@@ -222,8 +221,6 @@ async fn create_azure_client(
        timeout: Duration::from_secs(120),
    };
    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config)
-            .await
-            .context("remote storage init")?,
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -197,7 +197,6 @@ struct EnabledS3 {
 impl EnabledS3 {
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let client = create_s3_client(max_keys_in_list_response)
-            .await
            .context("S3 client creation")
            .expect("S3 client creation failed");

@@ -353,7 +352,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    }
 }

-async fn create_s3_client(
+fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    use rand::Rng;
@@ -386,9 +385,7 @@ async fn create_s3_client(
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
    Ok(Arc::new(
-        GenericRemoteStorage::from_config(&remote_storage_config)
-            .await
-            .context("remote storage init")?,
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -36,7 +36,6 @@ routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
-smallvec.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -33,10 +33,6 @@ pub enum Scope {
    GenerationsApi,
    // Allows access to control plane managment API and some storage controller endpoints.
    Admin,
-
-    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
-    /// of a tenant & post scrub results.
-    Scrubber,
 }

 /// JWT payload. See docs/authentication.md for the format
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -1,114 +0,0 @@
-use std::{
-    fmt::Display,
-    time::{Duration, Instant},
-};
-
-use metrics::IntCounter;
-
-/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
-/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
-/// to mitigate the log spam from repeated failures.
-pub struct CircuitBreaker {
-    /// An identifier that enables us to log useful errors when a circuit is broken
-    name: String,
-
-    /// Consecutive failures since last success
-    fail_count: usize,
-
-    /// How many consecutive failures before we break the circuit
-    fail_threshold: usize,
-
-    /// If circuit is broken, when was it broken?
-    broken_at: Option<Instant>,
-
-    /// If set, we will auto-reset the circuit this long after it was broken.  If None, broken
-    /// circuits stay broken forever, or until success() is called.
-    reset_period: Option<Duration>,
-
-    /// If this is true, no actual circuit-breaking happens.  This is for overriding a circuit breaker
-    /// to permit something to keep running even if it would otherwise have tripped it.
-    short_circuit: bool,
-}
-
-impl CircuitBreaker {
-    pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
-        Self {
-            name,
-            fail_count: 0,
-            fail_threshold,
-            broken_at: None,
-            reset_period,
-            short_circuit: false,
-        }
-    }
-
-    /// Construct an unbreakable circuit breaker, for use in unit tests etc.
-    pub fn short_circuit() -> Self {
-        Self {
-            name: String::new(),
-            fail_threshold: 0,
-            fail_count: 0,
-            broken_at: None,
-            reset_period: None,
-            short_circuit: true,
-        }
-    }
-
-    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
-    where
-        E: Display,
-    {
-        if self.short_circuit {
-            return;
-        }
-
-        self.fail_count += 1;
-        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
-            self.break_circuit(metric, error);
-        }
-    }
-
-    /// Call this after successfully executing an operation
-    pub fn success(&mut self, metric: &IntCounter) {
-        self.fail_count = 0;
-        if let Some(broken_at) = &self.broken_at {
-            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
-                humantime::format_duration(broken_at.elapsed()));
-            self.broken_at = None;
-            metric.inc();
-        }
-    }
-
-    /// Call this before attempting an operation, and skip the operation if we are currently broken.
-    pub fn is_broken(&mut self) -> bool {
-        if self.short_circuit {
-            return false;
-        }
-
-        if let Some(broken_at) = self.broken_at {
-            match self.reset_period {
-                Some(reset_period) if broken_at.elapsed() > reset_period => {
-                    self.reset_circuit();
-                    false
-                }
-                _ => true,
-            }
-        } else {
-            false
-        }
-    }
-
-    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
-    where
-        E: Display,
-    {
-        self.broken_at = Some(Instant::now());
-        tracing::error!(breaker=%self.name, "Circuit breaker broken!  Last error: {error}");
-        metric.inc();
-    }
-
-    fn reset_circuit(&mut self) {
-        self.broken_at = None;
-        self.fail_count = 0;
-    }
-}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -52,17 +52,17 @@ struct RequestId(String);
 /// There could be other ways to implement similar functionality:
 ///
 /// * procmacros placed on top of all handler methods
-///   With all the drawbacks of procmacros, brings no difference implementation-wise,
-///   and little code reduction compared to the existing approach.
+/// With all the drawbacks of procmacros, brings no difference implementation-wise,
+/// and little code reduction compared to the existing approach.
 ///
 /// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
-///   implemented for [`RouterBuilder`].
-///   Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
+/// implemented for [`RouterBuilder`].
+/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
 ///
 /// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
-///   later, in a post-response middleware.
-///   Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
-///   tries to achive with its `.instrument` used in the current approach.
+/// later, in a post-response middleware.
+/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
+/// tries to achive with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
 pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -74,15 +74,6 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
        .transpose()
 }

-pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
-    request: &Request<Body>,
-    param_name: &str,
-) -> Result<T, ApiError> {
-    parse_query_param(request, param_name)?.ok_or_else(|| {
-        ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
-    })
-}
-
 pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
    match request.body_mut().data().await {
        Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -302,6 +302,17 @@ pub struct TenantId(Id);

 id_newtype!(TenantId);

+/// Neon Connection Id identifies long-lived connections (for example a pagestream
+/// connection with the page_service). Is used for better logging and tracing
+///
+/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
+/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
+/// See [`Id`] for alternative ways to serialize it.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+pub struct ConnectionId(Id);
+
+id_newtype!(ConnectionId);
+
 // A pair uniquely identifying Neon instance.
 #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
 pub struct TenantTimelineId {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -26,8 +26,6 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;

-pub mod shard;
-
 mod hex;
 pub use hex::Hex;

@@ -98,8 +96,6 @@ pub mod poison;

 pub mod toml_edit_ext;

-pub mod circuit_breaker;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -1,451 +0,0 @@
-//! See `pageserver_api::shard` for description on sharding.
-
-use std::{ops::RangeInclusive, str::FromStr};
-
-use hex::FromHex;
-use serde::{Deserialize, Serialize};
-
-use crate::id::TenantId;
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardNumber(pub u8);
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(pub u8);
-
-/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-/// Formatting helper, for generating the `shard_id` label in traces.
-pub struct ShardSlug<'a>(&'a TenantShardId);
-
-/// TenantShardId globally identifies a particular shard in a particular tenant.
-///
-/// These are written as `<TenantId>-<ShardSlug>`, for example:
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
-/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
-/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
-///
-/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible with TenantId: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-impl ShardCount {
-    pub const MAX: Self = Self(u8::MAX);
-
-    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
-    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
-    ///
-    /// This method returns the actual number of shards, i.e. if our internal value is
-    /// zero, we return 1 (unsharded tenants have 1 shard).
-    pub fn count(&self) -> u8 {
-        if self.0 > 0 {
-            self.0
-        } else {
-            1
-        }
-    }
-
-    /// The literal internal value: this is **not** the number of shards in the
-    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
-    /// [`Self::count`] if you want to know the cardinality of shards.
-    pub fn literal(&self) -> u8 {
-        self.0
-    }
-
-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
-    pub fn is_unsharded(&self) -> bool {
-        self.0 == 0
-    }
-
-    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
-    /// [`Self::literal`] would return.
-    pub const fn new(val: u8) -> Self {
-        Self(val)
-    }
-}
-
-impl ShardNumber {
-    pub const MAX: Self = Self(u8::MAX);
-}
-
-impl TenantShardId {
-    pub fn unsharded(tenant_id: TenantId) -> Self {
-        Self {
-            tenant_id,
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
-    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
-    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
-        RangeInclusive::new(
-            Self {
-                tenant_id,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            },
-            Self {
-                tenant_id,
-                shard_number: ShardNumber::MAX,
-                shard_count: ShardCount::MAX,
-            },
-        )
-    }
-
-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-
-    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
-        self.shard_number == ShardNumber(0)
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
-    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
-    pub fn to_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_number: self.shard_number,
-            shard_count: self.shard_count,
-        }
-    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
-}
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
-    }
-}
-
-impl std::fmt::Display for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
-        } else {
-            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
-            // is distinct from the normal single shard case (shard count == 1).
-            self.tenant_id.fmt(f)
-        }
-    }
-}
-
-impl std::fmt::Debug for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for TenantShardId {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
-        if s.len() == 32 {
-            // Legacy case: no shard specified
-            Ok(Self {
-                tenant_id: TenantId::from_str(s)?,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            })
-        } else if s.len() == 37 {
-            let bytes = s.as_bytes();
-            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
-            Ok(Self {
-                tenant_id,
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 18]> for TenantShardId {
-    fn from(b: [u8; 18]) -> Self {
-        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
-
-        Self {
-            tenant_id: TenantId::from(tenant_id_bytes),
-            shard_number: ShardNumber(b[16]),
-            shard_count: ShardCount(b[17]),
-        }
-    }
-}
-
-impl ShardIndex {
-    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            shard_number: number,
-            shard_count: count,
-        }
-    }
-    pub fn unsharded() -> Self {
-        Self {
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
-
-    /// For use in constructing remote storage paths: concatenate this with a TenantId
-    /// to get a fully qualified TenantShardId.
-    ///
-    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
-    /// that the legacy pre-sharding remote key format is preserved.
-    pub fn get_suffix(&self) -> String {
-        if self.is_unsharded() {
-            "".to_string()
-        } else {
-            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-        }
-    }
-}
-
-impl std::fmt::Display for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-    }
-}
-
-impl std::fmt::Debug for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for ShardIndex {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 1 byte shard number, 1 byte shard count
-        if s.len() == 4 {
-            let bytes = s.as_bytes();
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(bytes, &mut shard_parts)?;
-            Ok(Self {
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 2]> for ShardIndex {
-    fn from(b: [u8; 2]) -> Self {
-        Self {
-            shard_number: ShardNumber(b[0]),
-            shard_count: ShardCount(b[1]),
-        }
-    }
-}
-
-impl Serialize for TenantShardId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Note: while human encoding of [`TenantShardId`] is backward and forward
-            // compatible, this binary encoding is not.
-            let mut packed: [u8; 18] = [0; 18];
-            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
-            packed[16] = self.shard_number.0;
-            packed[17] = self.shard_count.0;
-
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for TenantShardId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = TenantShardId;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 18])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 18] = Deserialize::deserialize(s)?;
-                Ok(TenantShardId::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                TenantShardId::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                18,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
-impl Serialize for ShardIndex {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Binary encoding is not used in index_part.json, but is included in anticipation of
-            // switching various structures (e.g. inter-process communication, remote metadata) to more
-            // compact binary encodings in future.
-            let mut packed: [u8; 2] = [0; 2];
-            packed[0] = self.shard_number.0;
-            packed[1] = self.shard_count.0;
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for ShardIndex {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = ShardIndex;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 2])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 2] = Deserialize::deserialize(s)?;
-                Ok(ShardIndex::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                ShardIndex::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                2,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,15 +1,11 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};

-use smallvec::SmallVec;
-
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum VecMapOrdering {
    Greater,
    GreaterOrEqual,
 }

-const INLINE_ELEMENTS: usize = 1;
-
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
@@ -17,7 +13,7 @@ const INLINE_ELEMENTS: usize = 1;
 /// during `VecMap` construction.
 #[derive(Clone, Debug)]
 pub struct VecMap<K, V> {
-    data: SmallVec<[(K, V); INLINE_ELEMENTS]>,
+    data: Vec<(K, V)>,
    ordering: VecMapOrdering,
 }

@@ -41,18 +37,14 @@ pub enum VecMapError {
 impl<K: Ord, V> VecMap<K, V> {
    pub fn new(ordering: VecMapOrdering) -> Self {
        Self {
-            data: Default::default(),
+            data: Vec::new(),
            ordering,
        }
    }

-    pub fn len(&self) -> usize {
-        self.data.len()
-    }
-
    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
        Self {
-            data: SmallVec::with_capacity(capacity),
+            data: Vec::with_capacity(capacity),
            ordering,
        }
    }
@@ -127,11 +119,6 @@ impl<K: Ord, V> VecMap<K, V> {
        Ok((None, delta_size))
    }

-    /// Where the key is known to be unique, and we don't want any instrumentation
-    pub fn append2(&mut self, key: K, value: V) {
-        self.data.push((key, value));
-    }
-
    /// Split the map into two.
    ///
    /// The left map contains everything before `cutoff` (exclusive).
@@ -148,11 +135,11 @@ impl<K: Ord, V> VecMap<K, V> {

        (
            VecMap {
-                data: SmallVec::from(&self.data[..split_idx]),
+                data: self.data[..split_idx].to_vec(),
                ordering: self.ordering,
            },
            VecMap {
-                data: SmallVec::from(&self.data[split_idx..]),
+                data: self.data[split_idx..].to_vec(),
                ordering: self.ordering,
            },
        )
@@ -199,10 +186,7 @@ impl<K: Ord, V> VecMap<K, V> {
    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
-    fn instrument_vec_op(
-        &mut self,
-        op: impl FnOnce(&mut SmallVec<[(K, V); INLINE_ELEMENTS]>),
-    ) -> usize {
+    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
        let old_cap = self.data.capacity();
        op(&mut self.data);
        let new_cap = self.data.capacity();
@@ -242,7 +226,7 @@ impl<K: Ord, V> VecMap<K, V> {

 impl<K: Ord, V> IntoIterator for VecMap<K, V> {
    type Item = (K, V);
-    type IntoIter = smallvec::IntoIter<[(K, V); INLINE_ELEMENTS]>;
+    type IntoIter = std::vec::IntoIter<(K, V)>;

    fn into_iter(self) -> Self::IntoIter {
        self.data.into_iter()
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -62,7 +62,6 @@ sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
-tikv-jemallocator.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
 tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 pageserver_api.workspace = true
 thiserror.workspace = true
 async-trait.workspace = true
-reqwest = { workspace = true, features = [ "stream" ] }
+reqwest.workspace = true
 utils.workspace = true
 serde.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,7 +1,6 @@
 use std::collections::HashMap;

 use bytes::Bytes;
-use detach_ancestor::AncestorDetached;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
@@ -10,8 +9,6 @@ use utils::{
    lsn::Lsn,
 };

-pub use reqwest::Body as ReqwestBody;
-
 pub mod util;

 #[derive(Debug, Clone)]
@@ -23,9 +20,6 @@ pub struct Client {

 #[derive(thiserror::Error, Debug)]
 pub enum Error {
-    #[error("send request: {0}")]
-    SendRequest(reqwest::Error),
-
    #[error("receive body: {0}")]
    ReceiveBody(reqwest::Error),

@@ -179,30 +173,19 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }

-    fn start_request<U: reqwest::IntoUrl>(
-        &self,
-        method: Method,
-        uri: U,
-    ) -> reqwest::RequestBuilder {
-        let req = self.client.request(method, uri);
-        if let Some(value) = &self.authorization_header {
-            req.header(reqwest::header::AUTHORIZATION, value)
-        } else {
-            req
-        }
-    }
-
    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
        body: B,
    ) -> Result<reqwest::Response> {
-        self.start_request(method, uri)
-            .json(&body)
-            .send()
-            .await
-            .map_err(Error::ReceiveBody)
+        let req = self.client.request(method, uri);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+        req.json(&body).send().await.map_err(Error::ReceiveBody)
    }

    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -419,23 +402,6 @@ impl Client {
        }
    }

-    pub async fn timeline_detach_ancestor(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-    ) -> Result<AncestorDetached> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
-            self.mgmt_api_endpoint
-        );
-
-        self.request(Method::PUT, &uri, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
@@ -643,53 +609,4 @@ impl Client {
            }),
        }
    }
-
-    pub async fn import_basebackup(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        base_lsn: Lsn,
-        end_lsn: Lsn,
-        pg_version: u32,
-        basebackup_tarball: ReqwestBody,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
-            self.mgmt_api_endpoint,
-        );
-        self.start_request(Method::PUT, uri)
-            .body(basebackup_tarball)
-            .send()
-            .await
-            .map_err(Error::SendRequest)?
-            .error_from_body()
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn import_wal(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        start_lsn: Lsn,
-        end_lsn: Lsn,
-        wal_tarball: ReqwestBody,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
-            self.mgmt_api_endpoint,
-        );
-        self.start_request(Method::PUT, uri)
-            .body(wal_tarball)
-            .send()
-            .await
-            .map_err(Error::SendRequest)?
-            .error_from_body()
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
 }
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -131,7 +131,7 @@ impl CompactionKey for Key {
 pub type CompactionKeySpace<K> = Vec<Range<K>>;

 /// Functions needed from all layers.
-pub trait CompactionLayer<K: CompactionKey> {
+pub trait CompactionLayer<K: CompactionKey + ?Sized> {
    fn key_range(&self) -> &Range<K>;
    fn lsn_range(&self) -> &Range<Lsn>;

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
                .get("remote_storage")
                .expect("need remote_storage");
            let config = RemoteStorageConfig::from_toml(toml_item)?;
-            let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
+            let storage = remote_storage::GenericRemoteStorage::from_config(&config);
            let cancel = CancellationToken::new();
            storage
                .unwrap()
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,14 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
-            Err(AuthError(
-                format!(
-                    "JWT scope '{:?}' is ineligible for Pageserver auth",
-                    claims.scope
-                )
-                .into(),
-            ))
-        }
+        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Pageserver auth",
+                claims.scope
+            )
+            .into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -47,9 +47,6 @@ use utils::{
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-#[global_allocator]
-static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
-
 const PID_FILE_NAME: &str = "pageserver.pid";

 const FEATURES: &[&str] = &[
@@ -385,7 +382,7 @@ fn start_pageserver(
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    // Set up remote storage client
-    let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;
+    let remote_storage = create_remote_storage_client(conf)?;

    // Set up deletion queue
    let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -622,6 +619,7 @@ fn start_pageserver(
                        metric_collection_endpoint,
                        &conf.metric_collection_bucket,
                        conf.metric_collection_interval,
+                        conf.cached_metric_collection_interval,
                        conf.synthetic_size_calculation_interval,
                        conf.id,
                        local_disk_storage,
@@ -659,6 +657,7 @@ fn start_pageserver(
                async move {
                    page_service::libpq_listener_main(
                        tenant_manager,
+                        broker_client,
                        pg_auth,
                        pageserver_listener,
                        conf.pg_auth_type,
@@ -701,7 +700,7 @@ fn start_pageserver(
    }
 }

-async fn create_remote_storage_client(
+fn create_remote_storage_client(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<GenericRemoteStorage> {
    let config = if let Some(config) = &conf.remote_storage_config {
@@ -711,7 +710,7 @@ async fn create_remote_storage_client(
    };

    // Create the client
-    let mut remote_storage = GenericRemoteStorage::from_config(config).await?;
+    let mut remote_storage = GenericRemoteStorage::from_config(config)?;

    // If `test_remote_failures` is non-zero, wrap the client with a
    // wrapper that simulates failures.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -12,6 +12,7 @@ use serde::de::IntoDeserializer;
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
+use utils::id::ConnectionId;
 use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
@@ -68,6 +69,7 @@ pub mod defaults {
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -90,7 +92,7 @@ pub mod defaults {
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
-        ImageCompressionAlgorithm::Disabled;
+        ImageCompressionAlgorithm::DisabledNoDecompress;

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;

@@ -122,6 +124,7 @@ pub mod defaults {
 #concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'

 #metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
+#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
@@ -236,6 +239,7 @@ pub struct PageServerConf {
    // How often to collect metrics and send them to the metrics endpoint.
    pub metric_collection_interval: Duration,
    // How often to send unchanged cached metrics to the metrics endpoint.
+    pub cached_metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<Url>,
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    pub synthetic_size_calculation_interval: Duration,
@@ -367,6 +371,7 @@ struct PageServerConfigBuilder {
    concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,

    metric_collection_interval: BuilderValue<Duration>,
+    cached_metric_collection_interval: BuilderValue<Duration>,
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,
    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
@@ -450,6 +455,10 @@ impl PageServerConfigBuilder {
                DEFAULT_METRIC_COLLECTION_INTERVAL,
            )
            .expect("cannot parse default metric collection interval")),
+            cached_metric_collection_interval: Set(humantime::parse_duration(
+                DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default cached_metric_collection_interval")),
            synthetic_size_calculation_interval: Set(humantime::parse_duration(
                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
            )
@@ -581,6 +590,14 @@ impl PageServerConfigBuilder {
        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
    }

+    pub fn cached_metric_collection_interval(
+        &mut self,
+        cached_metric_collection_interval: Duration,
+    ) {
+        self.cached_metric_collection_interval =
+            BuilderValue::Set(cached_metric_collection_interval)
+    }
+
    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
    }
@@ -714,6 +731,7 @@ impl PageServerConfigBuilder {
                broker_keepalive_interval,
                log_format,
                metric_collection_interval,
+                cached_metric_collection_interval,
                metric_collection_endpoint,
                metric_collection_bucket,
                synthetic_size_calculation_interval,
@@ -852,6 +870,22 @@ impl PageServerConf {
        )
    }

+    pub fn traces_path(&self) -> Utf8PathBuf {
+        self.workdir.join("traces")
+    }
+
+    pub fn trace_path(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        connection_id: &ConnectionId,
+    ) -> Utf8PathBuf {
+        self.traces_path()
+            .join(tenant_shard_id.to_string())
+            .join(timeline_id.to_string())
+            .join(connection_id.to_string())
+    }
+
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
@@ -930,6 +964,7 @@ impl PageServerConf {
                    NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
                }),
                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
+                "cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
                "metric_collection_endpoint" => {
                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
                    builder.metric_collection_endpoint(Some(endpoint));
@@ -1062,6 +1097,7 @@ impl PageServerConf {
            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
            ),
            metric_collection_interval: Duration::from_secs(60),
+            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
            metric_collection_bucket: None,
            synthetic_size_calculation_interval: Duration::from_secs(60),
@@ -1240,6 +1276,7 @@ initial_superuser_name = 'zzzz'
 id = 10

 metric_collection_interval = '222 s'
+cached_metric_collection_interval = '22200 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'

@@ -1295,6 +1332,9 @@ background_task_maximum_delay = '334 s'
                metric_collection_interval: humantime::parse_duration(
                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
                )?,
+                cached_metric_collection_interval: humantime::parse_duration(
+                    defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
+                )?,
                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: humantime::parse_duration(
@@ -1373,6 +1413,7 @@ background_task_maximum_delay = '334 s'
                eviction_task_immitated_concurrent_logical_size_queries:
                    ConfigurableSemaphore::default(),
                metric_collection_interval: Duration::from_secs(222),
+                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: Duration::from_secs(333),
@@ -1519,6 +1560,34 @@ broker_endpoint = '{broker_endpoint}'
        Ok(())
    }

+    #[test]
+    fn parse_tenant_config() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+        let broker_endpoint = "http://127.0.0.1:7777";
+        let trace_read_requests = true;
+
+        let config_string = format!(
+            r#"{ALL_BASE_VALUES_TOML}
+pg_distrib_dir='{pg_distrib_dir}'
+broker_endpoint = '{broker_endpoint}'
+
+[tenant_config]
+trace_read_requests = {trace_read_requests}"#,
+        );
+
+        let toml = config_string.parse()?;
+
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
+        assert_eq!(
+            conf.default_tenant_conf.trace_read_requests, trace_read_requests,
+            "Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
+        );
+
+        Ok(())
+    }
+
    #[test]
    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
        let config_string = r#"
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -46,12 +46,19 @@ pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
+    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
    local_disk_storage: Utf8PathBuf,
    cancel: CancellationToken,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
+    if _cached_metric_collection_interval != Duration::ZERO {
+        tracing::warn!(
+            "cached_metric_collection_interval is no longer used, please set it to zero."
+        )
+    }
+
    // spin up background worker that caclulates tenant sizes
    let worker_ctx =
        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
@@ -96,7 +103,7 @@ pub async fn collect_metrics(
        .expect("Failed to create http client with timeout");

    let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
-        match GenericRemoteStorage::from_config(bucket_config).await {
+        match GenericRemoteStorage::from_config(bucket_config) {
            Ok(client) => Some(client),
            Err(e) => {
                // Non-fatal error: if we were given an invalid config, we will proceed
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -59,7 +59,6 @@
 //! 1. It should be easy to forward the context to callees.
 //! 2. To propagate more data from high-level to low-level code, the functions in
 //!    the middle should not need to be modified.
-//!
 //! The solution is to have a container structure ([`RequestContext`]) that
 //! carries the information. Functions that don't care about what's in it
 //! pass it along to callees.
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -693,6 +693,7 @@ impl DeletionQueue {
 }

 #[cfg(test)]
+#[allow(unused)]
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
@@ -828,9 +829,9 @@ mod test {
        }
    }

-    async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
+    fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
        let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
-        let harness = TenantHarness::create(test_name).await?;
+        let harness = TenantHarness::create(test_name)?;

        // We do not load() the harness: we only need its config and remote_storage

@@ -844,9 +845,7 @@ mod test {
            },
            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
-        let storage = GenericRemoteStorage::from_config(&storage_config)
-            .await
-            .unwrap();
+        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

        let mock_control_plane = MockControlPlane::new();

@@ -924,9 +923,7 @@ mod test {
    #[tokio::test]
    async fn deletion_queue_smoke() -> anyhow::Result<()> {
        // Basic test that the deletion queue processes the deletions we pass into it
-        let ctx = setup("deletion_queue_smoke")
-            .await
-            .expect("Failed test setup");
+        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

@@ -995,10 +992,9 @@ mod test {
    }

    #[tokio::test]
+    #[cfg(any())]
    async fn deletion_queue_validation() -> anyhow::Result<()> {
-        let ctx = setup("deletion_queue_validation")
-            .await
-            .expect("Failed test setup");
+        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

@@ -1057,9 +1053,7 @@ mod test {
    #[tokio::test]
    async fn deletion_queue_recovery() -> anyhow::Result<()> {
        // Basic test that the deletion queue processes the deletions we pass into it
-        let mut ctx = setup("deletion_queue_recovery")
-            .await
-            .expect("Failed test setup");
+        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -377,7 +377,7 @@ paths:
              schema:
                $ref: "#/components/schemas/ConflictError"

-  /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
+  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
    parameters:
      - name: tenant_id
        in: path
@@ -397,51 +397,6 @@ paths:
        "202":
          description: Tenant scheduled to load successfully

-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-    put:
-      description: |
-        Either archives or unarchives the given timeline.
-        An archived timeline may not have any non-archived children.
-      requestBody:
-        required: false
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/ArchivalConfigRequest"
-      responses:
-        "200":
-          description: Timeline (un)archived successfully
-        "409":
-          description: |
-            The tenant/timeline is already being modified, perhaps by a concurrent call to this API
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
      - name: tenant_id
@@ -474,9 +429,7 @@ paths:
              schema:
                $ref: "#/components/schemas/SyntheticSizeResponse"
            text/html:
-              schema:
-                type: string
-                description: SVG representation of the tenant and its timelines.
+              description: SVG representation of the tenant and it's timelines.
        "401":
          description: Unauthorized Error
          content:
@@ -615,7 +568,7 @@ paths:
          type: string
      - name: timeline_id
        in: path
-        required: true
+        ŕequired: true
        schema:
          type: string

@@ -821,13 +774,15 @@ components:
    TenantCreateRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
-        - $ref: '#/components/schemas/TenantLoadRequest'
        - type: object
          required:
            - new_tenant_id
          properties:
            new_tenant_id:
              type: string
+            generation:
+              type: integer
+              description: Attachment generation number.
    TenantLoadRequest:
      type: object
      properties:
@@ -891,15 +846,6 @@ components:
        warm:
          type: boolean
          description: Whether to poll remote storage for layers to download.  If false, secondary locations don't download anything.
-    ArchivalConfigRequest:
-      type: object
-      required
-        - state
-      properties:
-        state:
-          description: The archival state of a timeline
-          type: string
-          enum: ["Archived", "Unarchived"]
    TenantConfig:
      type: object
      properties:
@@ -927,6 +873,8 @@ components:
          type: string
        max_lsn_wal_lag:
          type: integer
+        trace_read_requests:
+          type: boolean
        heatmap_period:
          type: string
    TenantConfigResponse:
@@ -1160,7 +1108,7 @@ components:
        reparented_timelines:
          type: array
          description: Set of reparented timeline ids
-          items:
+          properties:
            type: string
            format: hex
            description: TimelineId
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -10,7 +10,6 @@ use std::time::Duration;

 use anyhow::{anyhow, Context, Result};
 use enumset::EnumSet;
-use futures::StreamExt;
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
 use hyper::header;
@@ -18,17 +17,14 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::AuxFilePolicy;
-use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
-use pageserver_api::models::LocationConfigMode;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
-use pageserver_api::models::TenantLocationConfigRequest;
 use pageserver_api::models::TenantLocationConfigResponse;
 use pageserver_api::models::TenantScanRemoteStorageResponse;
 use pageserver_api::models::TenantScanRemoteStorageShard;
@@ -36,24 +32,24 @@ use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
-use pageserver_api::models::TimelineArchivalConfigRequest;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
+use pageserver_api::models::{
+    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
+};
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
 use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
-use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
-use utils::http::request::must_parse_query_param;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

 use crate::context::{DownloadBehavior, RequestContext};
@@ -665,39 +661,6 @@ async fn timeline_preserve_initdb_handler(
    json_response(StatusCode::OK, ())
 }

-async fn timeline_archival_config_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-
-    let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let state = get_state(&request);
-
-    async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
-
-        tenant
-            .apply_timeline_archival_config(timeline_id, request_data.state)
-            .await
-            .context("applying archival config")
-            .map_err(ApiError::InternalServerError)?;
-        Ok::<_, ApiError>(())
-    }
-    .instrument(info_span!("timeline_archival_config",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug(),
-                state = ?request_data.state,
-                %timeline_id))
-    .await?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1755,9 +1718,7 @@ async fn timeline_detach_ancestor_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::timeline::detach_ancestor;
-    use pageserver_api::models::detach_ancestor::AncestorDetached;
-
+    use crate::tenant::timeline::detach_ancestor::Options;
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1765,7 +1726,7 @@ async fn timeline_detach_ancestor_handler(
    let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);

    async move {
-        let mut options = detach_ancestor::Options::default();
+        let mut options = Options::default();

        let rewrite_concurrency =
            parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1793,36 +1754,27 @@ async fn timeline_detach_ancestor_handler(

        let timeline = tenant.get_timeline(timeline_id, true)?;

-        let progress = timeline
+        let (_guard, prepared) = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
            .await?;

-        // uncomment to allow early as possible Tenant::drop
-        // drop(tenant);
+        let res = state
+            .tenant_manager
+            .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
+            .await;

-        let resp = match progress {
-            detach_ancestor::Progress::Prepared(_guard, prepared) => {
-                // it would be great to tag the guard on to the tenant activation future
-                let reparented_timelines = state
-                    .tenant_manager
-                    .complete_detaching_timeline_ancestor(
-                        tenant_shard_id,
-                        timeline_id,
-                        prepared,
-                        ctx,
-                    )
-                    .await
-                    .context("timeline detach ancestor completion")
-                    .map_err(ApiError::InternalServerError)?;
-
-                AncestorDetached {
+        match res {
+            Ok(reparented_timelines) => {
+                let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
                    reparented_timelines,
-                }
-            }
-            detach_ancestor::Progress::Done(resp) => resp,
-        };
+                };

-        json_response(StatusCode::OK, resp)
+                json_response(StatusCode::OK, resp)
+            }
+            Err(e) => Err(ApiError::InternalServerError(
+                e.context("timeline detach completion"),
+            )),
+        }
    }
    .instrument(span)
    .await
@@ -2452,189 +2404,6 @@ async fn post_top_tenants(
    )
 }

-async fn put_tenant_timeline_import_basebackup(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
-    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
-
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
-    async move {
-        let state = get_state(&request);
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
-
-        let broker_client = state.broker_client.clone();
-
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
-
-        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-        let timeline = tenant
-            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
-            .map_err(ApiError::InternalServerError)
-            .await?;
-
-        // TODO mark timeline as not ready until it reaches end_lsn.
-        // We might have some wal to import as well, and we should prevent compute
-        // from connecting before that and writing conflicting wal.
-        //
-        // This is not relevant for pageserver->pageserver migrations, since there's
-        // no wal to import. But should be fixed if we want to import from postgres.
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import basebackup provided via CopyData
-        info!("importing basebackup");
-
-        timeline
-            .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        // Read the end of the tar archive.
-        read_tar_eof(body)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-
-        // TODO check checksum
-        // Meanwhile you can verify client-side by taking fullbackup
-        // and checking that it matches in size with what was imported.
-        // It wouldn't work if base came from vanilla postgres though,
-        // since we discard some log files.
-
-        info!("done");
-        json_response(StatusCode::OK, ())
-    }
-    .instrument(span)
-    .await
-}
-
-async fn put_tenant_timeline_import_wal(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
-    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-
-    check_permission(&request, Some(tenant_id))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
-    async move {
-        let state = get_state(&request);
-
-        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
-
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
-
-        let last_record_lsn = timeline.get_last_record_lsn();
-        if last_record_lsn != start_lsn {
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
-        }
-
-        // TODO leave clean state on error. For now you can use detach to clean
-        // up broken state from a failed import.
-
-        // Import wal provided via CopyData
-        info!("importing wal");
-        crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
-        info!("wal import complete");
-
-        // Read the end of the tar archive.
-        read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
-
-        // TODO Does it make sense to overshoot?
-        if timeline.get_last_record_lsn() < end_lsn {
-            return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
-        }
-
-        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
-        // We only want to persist the data, and it doesn't matter if it's in the
-        // shape of deltas or images.
-        info!("flushing layers");
-        timeline.freeze_and_flush().await.map_err(|e| match e {
-            tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
-            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-        })?;
-
-        info!("done");
-
-        json_response(StatusCode::OK, ())
-    }.instrument(span).await
-}
-
-/// Read the end of a tar archive.
-///
-/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
-/// `tokio_tar` already read the first such block. Read the second all-zeros block,
-/// and check that there is no more data after the EOF marker.
-///
-/// 'tar' command can also write extra blocks of zeros, up to a record
-/// size, controlled by the --record-size argument. Ignore them too.
-async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
-    use tokio::io::AsyncReadExt;
-    let mut buf = [0u8; 512];
-
-    // Read the all-zeros block, and verify it
-    let mut total_bytes = 0;
-    while total_bytes < 512 {
-        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
-        total_bytes += nbytes;
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if total_bytes < 512 {
-        anyhow::bail!("incomplete or invalid tar EOF marker");
-    }
-    if !buf.iter().all(|&x| x == 0) {
-        anyhow::bail!("invalid tar EOF marker");
-    }
-
-    // Drain any extra zero-blocks after the EOF marker
-    let mut trailing_bytes = 0;
-    let mut seen_nonzero_bytes = false;
-    loop {
-        let nbytes = reader.read(&mut buf).await?;
-        trailing_bytes += nbytes;
-        if !buf.iter().all(|&x| x == 0) {
-            seen_nonzero_bytes = true;
-        }
-        if nbytes == 0 {
-            break;
-        }
-    }
-    if seen_nonzero_bytes {
-        anyhow::bail!("unexpected non-zero bytes after the tar archive");
-    }
-    if trailing_bytes % 512 != 0 {
-        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
-    }
-    Ok(())
-}
-
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2823,10 +2592,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
            |r| api_handler(r, timeline_preserve_initdb_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
-            |r| api_handler(r, timeline_archival_config_handler),
-        )
        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
@@ -2933,13 +2698,5 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
            |r| testing_api_handler("perf_info", r, perf_info),
        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
-            |r| api_handler(r, put_tenant_timeline_import_basebackup),
-        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
-            |r| api_handler(r, put_tenant_timeline_import_wal),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,6 @@
 #![recursion_limit = "300"]
 #![deny(clippy::undocumented_unsafe_blocks)]
+#![allow(unused)]

 mod auth;
 pub mod basebackup;
@@ -23,6 +24,7 @@ pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
+pub mod trace;
 pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -473,31 +473,6 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
-#[strum(serialize_all = "kebab_case")]
-pub(crate) enum MetricLayerKind {
-    Delta,
-    Image,
-}
-
-static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_layer_bytes",
-        "Sum of layer physical sizes in bytes",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_layer_count",
-        "Number of layers that exist",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
-    )
-    .expect("failed to define a metric")
-});
-
 static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_archive_size",
@@ -594,38 +569,6 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_circuit_breaker_broken",
-        "How many times a circuit breaker has broken"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_circuit_breaker_unbroken",
-        "How many times a circuit breaker has been un-broken (recovered)"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_in_bytes_total",
-        "Size of uncompressed data written into image layers"
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_compression_image_out_bytes_total",
-        "Size of compressed image layer written"
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -1530,7 +1473,10 @@ pub(crate) enum ComputeCommandKind {
    PageStream,
    Basebackup,
    Fullbackup,
+    ImportBasebackup,
+    ImportWal,
    LeaseLsn,
+    Show,
 }

 pub(crate) struct ComputeCommandCounters {
@@ -2182,10 +2128,6 @@ pub(crate) struct TimelineMetrics {
    pub last_record_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
-    pub(crate) layer_size_image: UIntGauge,
-    pub(crate) layer_count_image: UIntGauge,
-    pub(crate) layer_size_delta: UIntGauge,
-    pub(crate) layer_count_delta: UIntGauge,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
@@ -2268,42 +2210,6 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

-        let layer_size_image = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_count_image = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_size_delta = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
-            .unwrap();
-
-        let layer_count_delta = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
-            .unwrap();
-
        let standby_horizon_gauge = STANDBY_HORIZON
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2358,10 +2264,6 @@ impl TimelineMetrics {
            last_record_gauge,
            pitr_history_size,
            archival_size,
-            layer_size_image,
-            layer_count_image,
-            layer_size_delta,
-            layer_count_delta,
            standby_horizon_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
@@ -2423,31 +2325,6 @@ impl TimelineMetrics {
        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
-
        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -4,7 +4,9 @@
 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
+use bytes::Bytes;
 use futures::stream::FuturesUnordered;
+use futures::Stream;
 use futures::StreamExt;
 use pageserver_api::key::Key;
 use pageserver_api::models::TenantState;
@@ -26,6 +28,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io;
 use std::net::TcpListener;
+use std::pin::pin;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -34,8 +37,10 @@ use std::time::Instant;
 use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::id::ConnectionId;
 use utils::sync::gate::GateGuard;
 use utils::{
    auth::{Claims, Scope, SwappableJwtAuth},
@@ -48,6 +53,7 @@ use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
 use crate::pgdatadir_mapping::Version;
@@ -60,11 +66,13 @@ use crate::tenant::mgr::GetTenantError;
 use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
 use crate::tenant::mgr::TenantManager;
+use crate::tenant::timeline::FlushLayerError;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Tenant;
 use crate::tenant::Timeline;
+use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -74,6 +82,56 @@ use postgres_ffi::BLCKSZ;
 // is not yet in state [`TenantState::Active`].
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

+/// Read the end of a tar archive.
+///
+/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
+/// `tokio_tar` already read the first such block. Read the second all-zeros block,
+/// and check that there is no more data after the EOF marker.
+///
+/// 'tar' command can also write extra blocks of zeros, up to a record
+/// size, controlled by the --record-size argument. Ignore them too.
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    let mut buf = [0u8; 512];
+
+    // Read the all-zeros block, and verify it
+    let mut total_bytes = 0;
+    while total_bytes < 512 {
+        let nbytes = reader.read(&mut buf[total_bytes..]).await?;
+        total_bytes += nbytes;
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if total_bytes < 512 {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    if !buf.iter().all(|&x| x == 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Drain any extra zero-blocks after the EOF marker
+    let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
+    loop {
+        let nbytes = reader.read(&mut buf).await?;
+        trailing_bytes += nbytes;
+        if !buf.iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
+        if nbytes == 0 {
+            break;
+        }
+    }
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    }
+    Ok(())
+}
+
 ///////////////////////////////////////////////////////////////////////////////

 ///
@@ -83,6 +141,7 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 ///
 pub async fn libpq_listener_main(
    tenant_manager: Arc<TenantManager>,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
@@ -127,6 +186,7 @@ pub async fn libpq_listener_main(
                    false,
                    page_service_conn_main(
                        tenant_manager.clone(),
+                        broker_client.clone(),
                        local_auth,
                        socket,
                        auth_type,
@@ -149,6 +209,7 @@ pub async fn libpq_listener_main(
 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
    tenant_manager: Arc<TenantManager>,
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
@@ -201,11 +262,12 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
+    let mut conn_handler =
+        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
-        .run(&mut conn_handler, &task_mgr::shutdown_token())
+        .run(&mut conn_handler, task_mgr::shutdown_watcher)
        .await
    {
        Ok(()) => {
@@ -232,6 +294,7 @@ struct HandlerTimeline {
 }

 struct PageServerHandler {
+    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -323,11 +386,13 @@ impl From<WaitLsnError> for QueryError {
 impl PageServerHandler {
    pub fn new(
        tenant_manager: Arc<TenantManager>,
+        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
            tenant_manager,
+            broker_client,
            auth,
            claims: None,
            connection_ctx,
@@ -410,6 +475,73 @@ impl PageServerHandler {
        )
    }

+    fn copyin_stream<'a, IO>(
+        &'a self,
+        pgb: &'a mut PostgresBackend<IO>,
+        cancel: &'a CancellationToken,
+    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        async_stream::try_stream! {
+            loop {
+                let msg = tokio::select! {
+                    biased;
+
+                    _ = cancel.cancelled() => {
+                        // We were requested to shut down.
+                        let msg = "pageserver is shutting down";
+                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                        Err(QueryError::Shutdown)
+                    }
+
+                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+                };
+
+                match msg {
+                    Ok(Some(message)) => {
+                        let copy_data_bytes = match message {
+                            FeMessage::CopyData(bytes) => bytes,
+                            FeMessage::CopyDone => { break },
+                            FeMessage::Sync => continue,
+                            FeMessage::Terminate => {
+                                let msg = "client terminated connection with Terminate message during COPY";
+                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                                break;
+                            }
+                            m => {
+                                let msg = format!("unexpected message {m:?}");
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                                break;
+                            }
+                        };
+
+                        yield copy_data_bytes;
+                    }
+                    Ok(None) => {
+                        let msg = "client closed connection during COPY";
+                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                        // error can't happen here, ErrorResponse serialization should be always ok
+                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                    }
+                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                        Err(io_error)?;
+                    }
+                    Err(other) => {
+                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                    }
+                };
+            }
+        }
+    }
+
    #[instrument(skip_all)]
    async fn handle_pagerequests<IO>(
        &mut self,
@@ -428,6 +560,18 @@ impl PageServerHandler {
            .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
            .await?;

+        // Make request tracer if needed
+        let mut tracer = if tenant.get_trace_read_requests() {
+            let connection_id = ConnectionId::generate();
+            let path =
+                tenant
+                    .conf
+                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
+            Some(Tracer::new(path))
+        } else {
+            None
+        };
+
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -459,6 +603,11 @@ impl PageServerHandler {
            trace!("query: {copy_data_bytes:?}");
            fail::fail_point!("ps::handle-pagerequest-message");

+            // Trace request if needed
+            if let Some(t) = tracer.as_mut() {
+                t.trace(&copy_data_bytes)
+            }
+
            let neon_fe_msg =
                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

@@ -564,6 +713,128 @@ impl PageServerHandler {
        Ok(())
    }

+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
+    async fn handle_import_basebackup<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        base_lsn: Lsn,
+        _end_lsn: Lsn,
+        pg_version: u32,
+        ctx: RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+
+        // Create empty timeline
+        info!("creating new timeline");
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
+            .await?;
+        let timeline = tenant
+            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
+            .await?;
+
+        // TODO mark timeline as not ready until it reaches end_lsn.
+        // We might have some wal to import as well, and we should prevent compute
+        // from connecting before that and writing conflicting wal.
+        //
+        // This is not relevant for pageserver->pageserver migrations, since there's
+        // no wal to import. But should be fixed if we want to import from postgres.
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Import basebackup provided via CopyData
+        info!("importing basebackup");
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
+        self.flush_cancellable(pgb, &tenant.cancel).await?;
+
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
+        timeline
+            .import_basebackup_from_tar(
+                tenant.clone(),
+                &mut copyin_reader,
+                base_lsn,
+                self.broker_client.clone(),
+                &ctx,
+            )
+            .await?;
+
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;
+
+        // TODO check checksum
+        // Meanwhile you can verify client-side by taking fullbackup
+        // and checking that it matches in size with what was imported.
+        // It wouldn't work if base came from vanilla postgres though,
+        // since we discard some log files.
+
+        info!("done");
+        Ok(())
+    }
+
+    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
+    async fn handle_import_wal<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        start_lsn: Lsn,
+        end_lsn: Lsn,
+        ctx: RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .await?;
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
+
+        // TODO leave clean state on error. For now you can use detach to clean
+        // up broken state from a failed import.
+
+        // Import wal provided via CopyData
+        info!("importing wal");
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
+        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
+        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
+        info!("wal import complete");
+
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;
+
+        // TODO Does it make sense to overshoot?
+        if timeline.get_last_record_lsn() < end_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
+
+        // Flush data to disk, then upload to s3. No need for a forced checkpoint.
+        // We only want to persist the data, and it doesn't matter if it's in the
+        // shape of deltas or images.
+        info!("flushing layers");
+        timeline.freeze_and_flush().await.map_err(|e| match e {
+            FlushLayerError::Cancelled => QueryError::Shutdown,
+            other => QueryError::Other(other.into()),
+        })?;
+
+        info!("done");
+        Ok(())
+    }
+
    /// Helper function to handle the LSN from client request.
    ///
    /// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1434,6 +1705,109 @@ where
            )
            .await?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("import basebackup ") {
+            // Import the `base` section (everything but the wal) of a basebackup.
+            // Assumes the tenant already exists on this pageserver.
+            //
+            // Files are scheduled to be persisted to remote storage, and the
+            // caller should poll the http api to check when that is done.
+            //
+            // Example import command:
+            // 1. Get start/end LSN from backup_manifest file
+            // 2. Run:
+            // cat my_backup/base.tar | psql -h $PAGESERVER \
+            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
+            let params = &parts[2..];
+            if params.len() != 5 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import basebackup command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let base_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+            let pg_version = u32::from_str(params[4])
+                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::ImportBasebackup)
+                .inc();
+
+            match self
+                .handle_import_basebackup(
+                    pgb,
+                    tenant_id,
+                    timeline_id,
+                    base_lsn,
+                    end_lsn,
+                    pg_version,
+                    ctx,
+                )
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
+        } else if query_string.starts_with("import wal ") {
+            // Import the `pg_wal` section of a basebackup.
+            //
+            // Files are scheduled to be persisted to remote storage, and the
+            // caller should poll the http api to check when that is done.
+            let params = &parts[2..];
+            if params.len() != 4 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import wal command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let start_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::ImportWal)
+                .inc();
+
+            match self
+                .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
@@ -1479,6 +1853,66 @@ where
                    ))?
                }
            };
+        } else if let Some(params) = parts.strip_prefix(&["show"]) {
+            // show <tenant_id>
+            if params.len() != 1 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for config command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+
+            tracing::Span::current().record("tenant_id", field::display(tenant_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::Show)
+                .inc();
+
+            let tenant = self
+                .get_active_tenant_with_timeout(
+                    tenant_id,
+                    ShardSelector::Zero,
+                    ACTIVE_TENANT_TIMEOUT,
+                )
+                .await?;
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
+                RowDescriptor::int8_col(b"checkpoint_distance"),
+                RowDescriptor::int8_col(b"checkpoint_timeout"),
+                RowDescriptor::int8_col(b"compaction_target_size"),
+                RowDescriptor::int8_col(b"compaction_period"),
+                RowDescriptor::int8_col(b"compaction_threshold"),
+                RowDescriptor::int8_col(b"gc_horizon"),
+                RowDescriptor::int8_col(b"gc_period"),
+                RowDescriptor::int8_col(b"image_creation_threshold"),
+                RowDescriptor::int8_col(b"pitr_interval"),
+            ]))?
+            .write_message_noflush(&BeMessage::DataRow(&[
+                Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
+                Some(
+                    tenant
+                        .get_checkpoint_timeout()
+                        .as_secs()
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(tenant.get_compaction_target_size().to_string().as_bytes()),
+                Some(
+                    tenant
+                        .get_compaction_period()
+                        .as_secs()
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(tenant.get_compaction_threshold().to_string().as_bytes()),
+                Some(tenant.get_gc_horizon().to_string().as_bytes()),
+                Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
+                Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
+                Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
+            ]))?
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else {
            return Err(QueryError::Other(anyhow::anyhow!(
                "unknown command {query_string}"
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -174,7 +174,6 @@ impl Timeline {
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
-            latest_rel_sizes: Default::default(),
            lsn,
        }
    }
@@ -523,7 +522,7 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Option<TimestampTz>, PageReconstructError> {
        let mut max: Option<TimestampTz> = None;
-        self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
+        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
            if let Some(max_prev) = max {
                max = Some(max_prev.max(timestamp));
            } else {
@@ -855,14 +854,13 @@ impl Timeline {
        result.add_key(DBDIR_KEY);

        // Fetch list of database dirs and iterate them
-        let dbdir = self.list_dbdirs(lsn, ctx).await?;
-        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
+        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
+        let dbdir = DbDirectory::des(&buf)?;

-        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
-        for ((spcnode, dbnode), has_relmap_file) in dbs {
-            if has_relmap_file {
-                result.add_key(relmap_file_key(spcnode, dbnode));
-            }
+        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
+        dbs.sort_unstable();
+        for (spcnode, dbnode) in dbs {
+            result.add_key(relmap_file_key(spcnode, dbnode));
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
@@ -921,9 +919,6 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

-        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
-        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
-        // and the keys will not be garbage-colllected.
        #[cfg(test)]
        {
            let guard = self.extra_test_dense_keyspace.load();
@@ -932,48 +927,13 @@ impl Timeline {
            }
        }

-        let dense_keyspace = result.to_keyspace();
-        let sparse_keyspace = SparseKeySpace(KeySpace {
-            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
-        });
-
-        if cfg!(debug_assertions) {
-            // Verify if the sparse keyspaces are ordered and non-overlapping.
-
-            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
-            // category of sparse keys are split into their own image/delta files. If there
-            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
-            // and we want the developer to keep the keyspaces separated.
-
-            let ranges = &sparse_keyspace.0.ranges;
-
-            // TODO: use a single overlaps_with across the codebase
-            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-                !(a.end <= b.start || b.end <= a.start)
-            }
-            for i in 0..ranges.len() {
-                for j in 0..i {
-                    if overlaps_with(&ranges[i], &ranges[j]) {
-                        panic!(
-                            "overlapping sparse keyspace: {}..{} and {}..{}",
-                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
-                        );
-                    }
-                }
-            }
-            for i in 1..ranges.len() {
-                assert!(
-                    ranges[i - 1].end <= ranges[i].start,
-                    "unordered sparse keyspace: {}..{} and {}..{}",
-                    ranges[i - 1].start,
-                    ranges[i - 1].end,
-                    ranges[i].start,
-                    ranges[i].end
-                );
-            }
-        }
-
-        Ok((dense_keyspace, sparse_keyspace))
+        Ok((
+            result.to_keyspace(),
+            /* AUX sparse key space */
+            SparseKeySpace(KeySpace {
+                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
+            }),
+        ))
    }

    /// Get cached size of relation if it not updated after specified LSN
@@ -1046,11 +1006,6 @@ pub struct DatadirModification<'a> {
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

-    // We update relation sizes when appending.  Since writing is single threaded, once we
-    // have updated a relation size we may be sure that its size is unchanged within the
-    // same DatadirModification
-    latest_rel_sizes: HashMap<RelTag, u32>,
-
    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1413,10 +1368,7 @@ impl<'a> DatadirModification<'a> {

        // Put size
        let size_key = rel_size_to_key(rel);
-        let old_size = match self.latest_rel_sizes.get(&rel) {
-            Some(s) => *s,
-            None => self.get(size_key, ctx).await?.get_u32_le(),
-        };
+        let old_size = self.get(size_key, ctx).await?.get_u32_le();

        // only extend relation here. never decrease the size
        if nblocks > old_size {
@@ -1427,8 +1379,6 @@ impl<'a> DatadirModification<'a> {
            self.tline.set_cached_rel_size(rel, self.lsn, nblocks);

            self.pending_nblocks += nblocks as i64 - old_size as i64;
-
-            self.latest_rel_sizes.insert(rel, nblocks);
        }
        Ok(())
    }
@@ -2042,7 +1992,7 @@ mod tests {
    #[tokio::test]
    async fn aux_files_round_trip() -> anyhow::Result<()> {
        let name = "aux_files_round_trip";
-        let harness = TenantHarness::create(name).await?;
+        let harness = TenantHarness::create(name)?;

        pub const TIMELINE_ID: TimelineId =
            TimelineId::from_array(hex!("11223344556677881122334455667788"));
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,7 +21,6 @@ use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
 use pageserver_api::models::AuxFilePolicy;
-use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::WalRedoManagerStatus;
@@ -40,7 +39,6 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::backoff;
-use utils::circuit_breaker::CircuitBreaker;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::failpoint_support;
@@ -78,8 +76,7 @@ use crate::is_uninit_mark;
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::TENANT;
 use crate::metrics::{
-    remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
-    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
+    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
 };
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -279,10 +276,6 @@ pub struct Tenant {

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

-    /// Track repeated failures to compact, so that we can back off.
-    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
-    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
-
    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
    /// background warmup.
@@ -1229,14 +1222,6 @@ impl Tenant {
        Ok(timeline_preloads)
    }

-    pub async fn apply_timeline_archival_config(
-        &self,
-        _timeline_id: TimelineId,
-        _config: TimelineArchivalState,
-    ) -> anyhow::Result<()> {
-        Ok(())
-    }
-
    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
        self.tenant_shard_id
    }
@@ -1656,31 +1641,13 @@ impl Tenant {
            timelines_to_compact
        };

-        // Before doing any I/O work, check our circuit breaker
-        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
-            info!("Skipping compaction due to previous failures");
-            return Ok(());
-        }
-
        for (timeline_id, timeline) in &timelines_to_compact {
            timeline
                .compact(cancel, EnumSet::empty(), ctx)
                .instrument(info_span!("compact_timeline", %timeline_id))
-                .await
-                .map_err(|e| {
-                    self.compaction_circuit_breaker
-                        .lock()
-                        .unwrap()
-                        .fail(&CIRCUIT_BREAKERS_BROKEN, &e);
-                    e
-                })?;
+                .await?;
        }

-        self.compaction_circuit_breaker
-            .lock()
-            .unwrap()
-            .success(&CIRCUIT_BREAKERS_UNBROKEN);
-
        Ok(())
    }

@@ -2374,6 +2341,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

+    pub fn get_trace_read_requests(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .trace_read_requests
+            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
+    }
+
    pub fn get_min_resident_size_override(&self) -> Option<u64> {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -2596,14 +2570,6 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
-            compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
-                format!("compaction-{tenant_shard_id}"),
-                5,
-                // Compaction can be a very expensive operation, and might leak disk space.  It also ought
-                // to be infallible, as long as remote storage is available.  So if it repeatedly fails,
-                // use an extremely long backoff.
-                Some(Duration::from_secs(3600 * 24)),
-            )),
            activate_now_sem: tokio::sync::Semaphore::new(0),
            cancel: CancellationToken::default(),
            gate: Gate::default(),
@@ -2921,7 +2887,7 @@ impl Tenant {
                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
                    if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
                        target.within_ancestor_pitr =
-                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
+                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
                    }
                }

@@ -2937,7 +2903,7 @@ impl Tenant {
                timeline.metrics.pitr_history_size.set(
                    timeline
                        .get_last_record_lsn()
-                        .checked_sub(target.cutoffs.time)
+                        .checked_sub(target.cutoffs.pitr)
                        .unwrap_or(Lsn(0))
                        .0,
                );
@@ -3752,6 +3718,7 @@ pub(crate) mod harness {
                walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
                lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
+                trace_read_requests: Some(tenant_conf.trace_read_requests),
                eviction_policy: Some(tenant_conf.eviction_policy),
                min_resident_size_override: tenant_conf.min_resident_size_override,
                evictions_low_residence_duration_metric_threshold: Some(
@@ -3797,7 +3764,7 @@ pub(crate) mod harness {
    }

    impl TenantHarness {
-        pub async fn create_custom(
+        pub fn create_custom(
            test_name: &'static str,
            tenant_conf: TenantConf,
            tenant_id: TenantId,
@@ -3833,7 +3800,7 @@ pub(crate) mod harness {
                },
                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            };
-            let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
+            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
            let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));

            Ok(Self {
@@ -3848,7 +3815,7 @@ pub(crate) mod harness {
            })
        }

-        pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
+        pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
            // Disable automatic GC and compaction to make the unit tests more deterministic.
            // The tests perform them manually if needed.
            let tenant_conf = TenantConf {
@@ -3865,7 +3832,6 @@ pub(crate) mod harness {
                shard,
                Generation::new(0xdeadbeef),
            )
-            .await
        }

        pub fn span(&self) -> tracing::Span {
@@ -3971,7 +3937,8 @@ pub(crate) mod harness {
    }
 }

-#[cfg(test)]
+#[cfg(any())]
+#[allow(unused)]
 mod tests {
    use std::collections::BTreeMap;

@@ -4002,7 +3969,7 @@ mod tests {

    #[tokio::test]
    async fn test_basic() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4049,8 +4016,7 @@ mod tests {

    #[tokio::test]
    async fn no_duplicate_timelines() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
-            .await?
+        let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
            .load()
            .await;
        let _ = tenant
@@ -4082,7 +4048,7 @@ mod tests {
    async fn test_branch() -> anyhow::Result<()> {
        use std::str::from_utf8;

-        let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4204,8 +4170,7 @@ mod tests {
    #[tokio::test]
    async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
        let (tenant, ctx) =
-            TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
-                .await?
+            TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
                .load()
                .await;
        let tline = tenant
@@ -4252,8 +4217,7 @@ mod tests {
    #[tokio::test]
    async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
        let (tenant, ctx) =
-            TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
-                .await?
+            TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
                .load()
                .await;

@@ -4275,7 +4239,7 @@ mod tests {
                    .source()
                    .unwrap()
                    .to_string()
-                    .contains("is earlier than latest GC cutoff"));
+                    .contains("is earlier than latest GC horizon"));
            }
        }

@@ -4308,8 +4272,7 @@ mod tests {
    #[tokio::test]
    async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
        let (tenant, ctx) =
-            TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
-                .await?
+            TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
                .load()
                .await;
        let tline = tenant
@@ -4366,8 +4329,7 @@ mod tests {
    #[tokio::test]
    async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
        let (tenant, ctx) =
-            TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
-                .await?
+            TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
                .load()
                .await;
        let tline = tenant
@@ -4397,10 +4359,10 @@ mod tests {
    }
    #[tokio::test]
    async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
-            .await?
-            .load()
-            .await;
+        let (tenant, ctx) =
+            TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
+                .load()
+                .await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4438,7 +4400,7 @@ mod tests {
    #[tokio::test]
    async fn timeline_load() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load";
-        let harness = TenantHarness::create(TEST_NAME).await?;
+        let harness = TenantHarness::create(TEST_NAME)?;
        {
            let (tenant, ctx) = harness.load().await;
            let tline = tenant
@@ -4465,7 +4427,7 @@ mod tests {
    #[tokio::test]
    async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
        const TEST_NAME: &str = "timeline_load_with_ancestor";
-        let harness = TenantHarness::create(TEST_NAME).await?;
+        let harness = TenantHarness::create(TEST_NAME)?;
        // create two timelines
        {
            let (tenant, ctx) = harness.load().await;
@@ -4513,10 +4475,7 @@ mod tests {
    #[tokio::test]
    async fn delta_layer_dumping() -> anyhow::Result<()> {
        use storage_layer::AsLayerDesc;
-        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")
-            .await?
-            .load()
-            .await;
+        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4543,7 +4502,7 @@ mod tests {

    #[tokio::test]
    async fn test_images() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;
@@ -4714,7 +4673,7 @@ mod tests {
    //
    #[tokio::test]
    async fn test_bulk_insert() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_bulk_insert").await?;
+        let harness = TenantHarness::create("test_bulk_insert")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -4744,8 +4703,9 @@ mod tests {
    // There's one major downside to this test: delta layers only contains images,
    // so the search can stop at the first delta layer and doesn't traverse any deeper.
    #[tokio::test]
+    #[cfg(any())]
    async fn test_get_vectored() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_get_vectored").await?;
+        let harness = TenantHarness::create("test_get_vectored")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -4823,7 +4783,7 @@ mod tests {

    #[tokio::test]
    async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
+        let harness = TenantHarness::create("test_get_vectored_aux_files")?;

        let (tenant, ctx) = harness.load().await;
        let tline = tenant
@@ -4892,6 +4852,7 @@ mod tests {
    // ------------------------------+
    // ```
    #[tokio::test]
+    #[cfg(any())]
    async fn test_get_vectored_key_gap() -> anyhow::Result<()> {
        let tenant_conf = TenantConf {
            // Make compaction deterministic
@@ -4909,8 +4870,7 @@ mod tests {
            TenantId::generate(),
            ShardIdentity::unsharded(),
            Generation::new(0xdeadbeef),
-        )
-        .await?;
+        )?;
        let (tenant, ctx) = harness.load().await;

        let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5052,8 +5012,9 @@ mod tests {
    // * X - page images
    // ```
    #[tokio::test]
+    #[cfg(any())]
    async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
+        let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
        let (tenant, ctx) = harness.load().await;

        let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5202,7 +5163,7 @@ mod tests {
        name: &'static str,
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name).await?;
+        let mut harness = TenantHarness::create(name)?;
        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
            kind: compaction_algorithm,
        };
@@ -5286,8 +5247,7 @@ mod tests {

    #[tokio::test]
    async fn test_traverse_branches() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_traverse_branches")
-            .await?
+        let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
            .load()
            .await;
        let mut tline = tenant
@@ -5377,8 +5337,7 @@ mod tests {

    #[tokio::test]
    async fn test_traverse_ancestors() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")
-            .await?
+        let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")?
            .load()
            .await;
        let mut tline = tenant
@@ -5444,8 +5403,7 @@ mod tests {

    #[tokio::test]
    async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")
-            .await?
+        let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")?
            .load()
            .await;

@@ -5514,7 +5472,7 @@ mod tests {
    #[tokio::test]
    async fn test_create_guard_crash() -> anyhow::Result<()> {
        let name = "test_create_guard_crash";
-        let harness = TenantHarness::create(name).await?;
+        let harness = TenantHarness::create(name)?;
        {
            let (tenant, ctx) = harness.load().await;
            let tline = tenant
@@ -5567,7 +5525,7 @@ mod tests {
        name: &'static str,
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name).await?;
+        let mut harness = TenantHarness::create(name)?;
        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
            kind: compaction_algorithm,
        };
@@ -5591,7 +5549,7 @@ mod tests {

    #[tokio::test]
    async fn test_metadata_scan() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_scan").await?;
+        let harness = TenantHarness::create("test_metadata_scan")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5710,7 +5668,7 @@ mod tests {

    #[tokio::test]
    async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_compaction_trigger").await?;
+        let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5769,9 +5727,7 @@ mod tests {

    #[tokio::test]
    async fn test_branch_copies_dirty_aux_file_flag() {
-        let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
-            .await
-            .unwrap();
+        let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();

        // the default aux file policy to switch is v1 if not set by the admins
        assert_eq!(
@@ -5873,9 +5829,7 @@ mod tests {

    #[tokio::test]
    async fn aux_file_policy_switch() {
-        let mut harness = TenantHarness::create("aux_file_policy_switch")
-            .await
-            .unwrap();
+        let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
        let (tenant, ctx) = harness.load().await;

@@ -6049,9 +6003,7 @@ mod tests {

    #[tokio::test]
    async fn aux_file_policy_force_switch() {
-        let mut harness = TenantHarness::create("aux_file_policy_force_switch")
-            .await
-            .unwrap();
+        let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
        let (tenant, ctx) = harness.load().await;

@@ -6112,9 +6064,7 @@ mod tests {

    #[tokio::test]
    async fn aux_file_policy_auto_detect() {
-        let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
-            .await
-            .unwrap();
+        let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
        let (tenant, ctx) = harness.load().await;

@@ -6177,7 +6127,7 @@ mod tests {

    #[tokio::test]
    async fn test_metadata_image_creation() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_image_creation").await?;
+        let harness = TenantHarness::create("test_metadata_image_creation")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -6276,7 +6226,7 @@ mod tests {

    #[tokio::test]
    async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?;
+        let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
        let (tenant, ctx) = harness.load().await;

        let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
@@ -6348,7 +6298,7 @@ mod tests {

    #[tokio::test]
    async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
+        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
        let (tenant, ctx) = harness.load().await;

        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6440,7 +6390,7 @@ mod tests {

    #[tokio::test]
    async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_reads").await?;
+        let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
        let (tenant, ctx) = harness.load().await;
        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6520,9 +6470,7 @@ mod tests {

    #[tokio::test]
    async fn test_metadata_tombstone_image_creation() {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")
-            .await
-            .unwrap();
+        let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
        let (tenant, ctx) = harness.load().await;

        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6594,9 +6542,8 @@ mod tests {

    #[tokio::test]
    async fn test_metadata_tombstone_empty_image_creation() {
-        let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
-            .await
-            .unwrap();
+        let harness =
+            TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
        let (tenant, ctx) = harness.load().await;

        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6659,7 +6606,7 @@ mod tests {

    #[tokio::test]
    async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
        let (tenant, ctx) = harness.load().await;

        fn get_key(id: u32) -> Key {
@@ -6751,8 +6698,8 @@ mod tests {
        {
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x30);
-            guard.cutoffs.space = Lsn(0x30);
+            guard.cutoffs.pitr = Lsn(0x30);
+            guard.cutoffs.horizon = Lsn(0x30);
        }

        let expected_result = [
@@ -6843,7 +6790,7 @@ mod tests {
            vec![
                // Image layer at GC horizon
                PersistentLayerKey {
-                    key_range: Key::MIN..Key::MAX,
+                    key_range: Key::MIN..get_key(10),
                    lsn_range: Lsn(0x30)..Lsn(0x31),
                    is_delta: false
                },
@@ -6867,7 +6814,7 @@ mod tests {

    #[tokio::test]
    async fn test_neon_test_record() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_neon_test_record").await?;
+        let harness = TenantHarness::create("test_neon_test_record")?;
        let (tenant, ctx) = harness.load().await;

        fn get_key(id: u32) -> Key {
@@ -6948,7 +6895,7 @@ mod tests {

    #[tokio::test]
    async fn test_lsn_lease() -> anyhow::Result<()> {
-        let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
+        let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await;
        let key = Key::from_hex("010000000033333333444444445500000000").unwrap();

        let end_lsn = Lsn(0x100);
@@ -7037,7 +6984,7 @@ mod tests {

    #[tokio::test]
    async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?;
        let (tenant, ctx) = harness.load().await;

        fn get_key(id: u32) -> Key {
@@ -7142,8 +7089,8 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
-                    space: Lsn(0x30),
+                    pitr: Lsn(0x30),
+                    horizon: Lsn(0x30),
                },
                leases: Default::default(),
                within_ancestor_pitr: false,
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> {
 }

 /// Reserved bits for length and compression
-pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
+const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;

 /// The maximum size of blobs we support. The highest few bits
 /// are reserved for compression and other further uses.
 const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;

-pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
-pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
+const BYTE_UNCOMPRESSED: u8 = 0x80;
+const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

 /// A wrapper of `VirtualFile` that allows users to write blobs.
 ///
@@ -273,8 +273,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: B,
        ctx: &RequestContext,
    ) -> (B::Buf, Result<u64, Error>) {
-        self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
-            .await
+        self.write_blob_maybe_compressed(
+            srcbuf,
+            ctx,
+            ImageCompressionAlgorithm::DisabledNoDecompress,
+        )
+        .await
    }

    /// Write a blob of data. Returns the offset that it was written to,
@@ -336,7 +340,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                            (BYTE_UNCOMPRESSED, len, slice.into_inner())
                        }
                    }
-                    ImageCompressionAlgorithm::Disabled => {
+                    ImageCompressionAlgorithm::Disabled
+                    | ImageCompressionAlgorithm::DisabledNoDecompress => {
                        (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
                    }
                };
@@ -389,64 +394,52 @@ impl BlobWriter<false> {
    }
 }

-#[cfg(test)]
-pub(crate) mod tests {
+#[cfg(any())]
+mod tests {
    use super::*;
    use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
-    use camino::Utf8PathBuf;
-    use camino_tempfile::Utf8TempDir;
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
        round_trip_test_compressed::<BUFFERED>(blobs, false).await
    }

-    pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
+    async fn round_trip_test_compressed<const BUFFERED: bool>(
        blobs: &[Vec<u8>],
        compression: bool,
-        ctx: &RequestContext,
-    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
+    ) -> Result<(), Error> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
-            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
+            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let (_, res) = if compression {
                    wtr.write_blob_maybe_compressed(
                        blob.clone(),
-                        ctx,
+                        &ctx,
                        ImageCompressionAlgorithm::Zstd { level: Some(1) },
                    )
                    .await
                } else {
-                    wtr.write_blob(blob.clone(), ctx).await
+                    wtr.write_blob(blob.clone(), &ctx).await
                };
                let offs = res?;
                offsets.push(offs);
            }
            // Write out one page worth of zeros so that we can
            // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
            let offs = res?;
            println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(ctx).await?;
+            wtr.flush_buffer(&ctx).await?;
        }
-        Ok((temp_dir, pathbuf, offsets))
-    }

-    async fn round_trip_test_compressed<const BUFFERED: bool>(
-        blobs: &[Vec<u8>],
-        compression: bool,
-    ) -> Result<(), Error> {
-        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
-        let (_temp_dir, pathbuf, offsets) =
-            write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
-
-        let file = VirtualFile::open(pathbuf, &ctx).await?;
+        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new_with_compression(rdr, compression);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
@@ -459,7 +452,7 @@ pub(crate) mod tests {
        Ok(())
    }

-    pub(crate) fn random_array(len: usize) -> Vec<u8> {
+    fn random_array(len: usize) -> Vec<u8> {
        let mut rng = rand::thread_rng();
        (0..len).map(|_| rng.gen()).collect::<_>()
    }
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -86,7 +86,7 @@ pub(crate) enum BlockReaderRef<'a> {
    Slice(&'a [u8]),
    #[cfg(test)]
    TestDisk(&'a super::disk_btree::tests::TestDisk),
-    #[cfg(test)]
+    #[cfg(any())]
    VirtualFile(&'a VirtualFile),
 }

@@ -105,7 +105,7 @@ impl<'a> BlockReaderRef<'a> {
            Slice(s) => Self::read_blk_slice(s, blknum),
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
-            #[cfg(test)]
+            #[cfg(any())]
            VirtualFile(r) => r.read_blk(blknum, ctx).await,
        }
    }
@@ -202,10 +202,18 @@ pub struct FileBlockReader<'a> {

 impl<'a> FileBlockReader<'a> {
    pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
+        Self::new_with_compression(file, file_id, false)
+    }
+
+    pub fn new_with_compression(
+        file: &'a VirtualFile,
+        file_id: FileId,
+        compressed_reads: bool,
+    ) -> Self {
        FileBlockReader {
            file_id,
            file,
-            compressed_reads: true,
+            compressed_reads,
        }
    }

--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -335,6 +335,7 @@ pub struct TenantConf {
    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
    /// to avoid eager reconnects.
    pub max_lsn_wal_lag: NonZeroU64,
+    pub trace_read_requests: bool,
    pub eviction_policy: EvictionPolicy,
    pub min_resident_size_override: Option<u64>,
    // See the corresponding metric's help string.
@@ -435,6 +436,10 @@ pub struct TenantConfOpt {
    #[serde(default)]
    pub max_lsn_wal_lag: Option<NonZeroU64>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub trace_read_requests: Option<bool>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub eviction_policy: Option<EvictionPolicy>,
@@ -514,6 +519,9 @@ impl TenantConfOpt {
                .lagging_wal_timeout
                .unwrap_or(global_conf.lagging_wal_timeout),
            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
+            trace_read_requests: self
+                .trace_read_requests
+                .unwrap_or(global_conf.trace_read_requests),
            eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
            min_resident_size_override: self
                .min_resident_size_override
@@ -573,6 +581,7 @@ impl Default for TenantConf {
                .expect("cannot parse default walreceiver lagging wal timeout"),
            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            trace_read_requests: false,
            eviction_policy: EvictionPolicy::NoEviction,
            min_resident_size_override: None,
            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
@@ -650,6 +659,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
            lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
            max_lsn_wal_lag: value.max_lsn_wal_lag,
+            trace_read_requests: value.trace_read_requests,
            eviction_policy: value.eviction_policy,
            min_resident_size_override: value.min_resident_size_override,
            evictions_low_residence_duration_metric_threshold: value
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -262,7 +262,7 @@ where

    pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
    where
-        R: 'a + Send,
+        R: 'a,
    {
        DiskBtreeIterator {
            stream: Box::pin(self.into_stream(start_key, ctx)),
@@ -521,7 +521,7 @@ where
 pub struct DiskBtreeIterator<'a> {
    #[allow(clippy::type_complexity)]
    stream: std::pin::Pin<
-        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
+        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
    >,
 }

@@ -550,10 +550,10 @@ where
    /// We maintain the length of the stack to be always greater than zero.
    /// Two exceptions are:
    /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
-    ///    So because other methods cannot see the intermediate state invariant still holds.
+    ///   So because other methods cannot see the intermediate state invariant still holds.
    /// 2. `Self::finish`. It consumes self and does not return it back,
-    ///    which means that this is where the structure is destroyed.
-    ///    Thus stack of zero length cannot be observed by other methods.
+    ///  which means that this is where the structure is destroyed.
+    ///  Thus stack of zero length cannot be observed by other methods.
    stack: Vec<BuildNode<L>>,

    /// Last key that was appended to the tree. Used to sanity check that append
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
@@ -28,7 +28,7 @@ use crate::{
    },
 };

-const TAIL_SZ: usize = 4096 * 1024;
+const TAIL_SZ: usize = 64 * 1024;

 /// See module-level comment.
 pub struct RW<W: OwnedAsyncWriter> {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2698,9 +2698,7 @@ mod tests {
        // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
        // wait for it to complete before proceeding.

-        let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
-            .await
-            .unwrap();
+        let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
        let (t, _ctx) = h.load().await;

        // harness loads it to active, which is forced and nothing is running on the tenant
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -241,7 +241,7 @@ use self::index::IndexPart;

 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
-use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
+use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

 pub(crate) use download::{
@@ -1930,31 +1930,6 @@ impl RemoteTimelineClient {
            }
        }
    }
-
-    /// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
-    /// externally to RemoteTimelineClient.
-    pub(crate) fn initialized_upload_queue(
-        &self,
-    ) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
-        let mut inner = self.upload_queue.lock().unwrap();
-        inner.initialized_mut()?;
-        Ok(UploadQueueAccessor { inner })
-    }
-}
-
-pub(crate) struct UploadQueueAccessor<'a> {
-    inner: std::sync::MutexGuard<'a, UploadQueue>,
-}
-
-impl<'a> UploadQueueAccessor<'a> {
-    pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
-        match &*self.inner {
-            UploadQueue::Initialized(x) => &x.clean.0,
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                unreachable!("checked before constructing")
-            }
-        }
-    }
 }

 pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -2128,7 +2103,7 @@ mod tests {
    impl TestSetup {
        async fn new(test_name: &str) -> anyhow::Result<Self> {
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
-            let harness = TenantHarness::create(test_name).await?;
+            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = harness.load().await;

            let timeline = tenant
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -176,24 +176,6 @@ pub(crate) struct Lineage {
    ///
    /// If you are adding support for detaching from a hierarchy, consider changing the ancestry
    /// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
-    // FIXME: this is insufficient even for path of two timelines for future wal recovery
-    // purposes:
-    //
-    // assuming a "old main" which has received most of the WAL, and has a branch "new main",
-    // starting a bit before "old main" last_record_lsn. the current version works fine,
-    // because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
-    //
-    // then assuming "new main" would similarly receive a branch right before its last_record_lsn,
-    // "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
-    // here. however, we cannot recover from WAL using only that information, we would need the
-    // whole ancestry here:
-    //
-    // ```json
-    // [
-    //   ["old main", ancestor_lsn("new main"), _],
-    //   ["new main", ancestor_lsn("new new main"), _]
-    // ]
-    // ```
    #[serde(skip_serializing_if = "Option::is_none", default)]
    original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
 }
@@ -235,14 +217,6 @@ impl Lineage {
        self.original_ancestor
            .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
    }
-
-    pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
-        self.original_ancestor.is_some()
-    }
-
-    pub(crate) fn is_reparented(&self) -> bool {
-        !self.reparenting_history.is_empty()
-    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -135,9 +135,11 @@ pub struct TimelineInputs {
    ancestor_lsn: Lsn,
    last_record: Lsn,
    latest_gc_cutoff: Lsn,
+    horizon_cutoff: Lsn,
+    pitr_cutoff: Lsn,

    /// Cutoff point based on GC settings
-    next_pitr_cutoff: Lsn,
+    next_gc_cutoff: Lsn,

    /// Cutoff point calculated from the user-supplied 'max_retention_period'
    retention_param_cutoff: Option<Lsn>,
@@ -148,7 +150,7 @@ pub struct TimelineInputs {

 /// Gathers the inputs for the tenant sizing model.
 ///
-/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
+/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
 /// is updated on-demand, during the start of this calculation and separate from the
 /// [`TimelineInputs::latest_gc_cutoff`].
 ///
@@ -156,8 +158,11 @@ pub struct TimelineInputs {
 ///
 /// ```text
 /// 0-----|---------|----|------------| · · · · · |·> lsn
-///   initdb_lsn  branchpoints*  next_pitr_cutoff  latest
+///   initdb_lsn  branchpoints*  next_gc_cutoff  latest
 /// ```
+///
+/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
+/// tenant size will be zero.
 pub(super) async fn gather_inputs(
    tenant: &Tenant,
    limit: &Arc<Semaphore>,
@@ -167,7 +172,7 @@ pub(super) async fn gather_inputs(
    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> Result<ModelInputs, CalculateSyntheticSizeError> {
-    // refresh is needed to update [`timeline::GcCutoffs`]
+    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
    tenant.refresh_gc_info(cancel, ctx).await?;

    // Collect information about all the timelines
@@ -231,18 +236,20 @@ pub(super) async fn gather_inputs(
        // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
        // actually removing files.
        //
-        // We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
+        // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
        // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
-        // than our internal space cutoff.  This means that if someone drops a database and waits for their
+        // than a space bound (horizon cutoff).  This means that if someone drops a database and waits for their
        // PITR interval, they will see synthetic size decrease, even if we are still storing data inside
-        // the space cutoff.
-        let mut next_pitr_cutoff = gc_info.cutoffs.time;
+        // horizon_cutoff.
+        let pitr_cutoff = gc_info.cutoffs.pitr;
+        let horizon_cutoff = gc_info.cutoffs.horizon;
+        let mut next_gc_cutoff = pitr_cutoff;

        // If the caller provided a shorter retention period, use that instead of the GC cutoff.
        let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
            let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
-            if next_pitr_cutoff < param_cutoff {
-                next_pitr_cutoff = param_cutoff;
+            if next_gc_cutoff < param_cutoff {
+                next_gc_cutoff = param_cutoff;
            }
            Some(param_cutoff)
        } else {
@@ -256,7 +263,7 @@ pub(super) async fn gather_inputs(
            .copied()
            .collect::<Vec<_>>();

-        // next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
+        // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
        // want to query any logical size before initdb_lsn.
        let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);

@@ -284,10 +291,10 @@ pub(super) async fn gather_inputs(
            )
        }

-        // Add a point for the PITR cutoff
-        let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
+        // Add a point for the GC cutoff
+        let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
        if !branch_start_needed {
-            lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
+            lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
        }

        lsns.sort_unstable();
@@ -326,7 +333,7 @@ pub(super) async fn gather_inputs(
                    parent: Some(parent),
                    lsn: lsn.0,
                    size: None,
-                    needed: lsn > next_pitr_cutoff,
+                    needed: lsn > next_gc_cutoff,
                },
                timeline_id: timeline.timeline_id,
                kind,
@@ -350,8 +357,8 @@ pub(super) async fn gather_inputs(
                    segment: Segment {
                        parent: Some(lease_parent),
                        lsn: lsn.0,
-                        size: None,                     // Filled in later, if necessary
-                        needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention.
+                        size: None,                   // Filled in later, if necessary
+                        needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
                    },
                    timeline_id: timeline.timeline_id,
                    kind: LsnKind::LeaseStart,
@@ -391,7 +398,9 @@ pub(super) async fn gather_inputs(
            last_record: last_record_lsn,
            // this is not used above, because it might not have updated recently enough
            latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
-            next_pitr_cutoff,
+            horizon_cutoff,
+            pitr_cutoff,
+            next_gc_cutoff,
            retention_param_cutoff,
            lease_points,
        });
@@ -733,7 +742,9 @@ fn verify_size_for_multiple_branches() {
      "ancestor_lsn": "0/18D3D98",
      "last_record": "0/2230CD0",
      "latest_gc_cutoff": "0/1698C48",
-      "next_pitr_cutoff": "0/2210CD0",
+      "horizon_cutoff": "0/2210CD0",
+      "pitr_cutoff": "0/2210CD0",
+      "next_gc_cutoff": "0/2210CD0",
      "retention_param_cutoff": null,
      "lease_points": []
    },
@@ -742,7 +753,9 @@ fn verify_size_for_multiple_branches() {
      "ancestor_lsn": "0/176D998",
      "last_record": "0/1837770",
      "latest_gc_cutoff": "0/1698C48",
-      "next_pitr_cutoff": "0/1817770",
+      "horizon_cutoff": "0/1817770",
+      "pitr_cutoff": "0/1817770",
+      "next_gc_cutoff": "0/1817770",
      "retention_param_cutoff": null,
      "lease_points": []
    },
@@ -751,7 +764,9 @@ fn verify_size_for_multiple_branches() {
      "ancestor_lsn": "0/0",
      "last_record": "0/18D3D98",
      "latest_gc_cutoff": "0/1698C48",
-      "next_pitr_cutoff": "0/18B3D98",
+      "horizon_cutoff": "0/18B3D98",
+      "pitr_cutoff": "0/18B3D98",
+      "next_gc_cutoff": "0/18B3D98",
      "retention_param_cutoff": null,
      "lease_points": []
    }
@@ -805,7 +820,9 @@ fn verify_size_for_one_branch() {
      "ancestor_lsn": "0/0",
      "last_record": "47/280A5860",
      "latest_gc_cutoff": "47/240A5860",
-      "next_pitr_cutoff": "47/240A5860",
+      "horizon_cutoff": "47/240A5860",
+      "pitr_cutoff": "47/240A5860",
+      "next_gc_cutoff": "47/240A5860",
      "retention_param_cutoff": "0/0",
      "lease_points": []
    }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -6,7 +6,6 @@ pub(crate) mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
-pub mod merge_iterator;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
@@ -674,26 +673,6 @@ impl LayerAccessStats {
            },
        }
    }
-
-    /// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
-    ///
-    /// This indicates whether the layer has been used for some purpose that would motivate
-    /// us to keep it on disk, such as for serving a getpage request.
-    fn accessed(&self) -> bool {
-        let locked = self.0.lock().unwrap();
-        let inner = &locked.for_eviction_policy;
-
-        // Consider it accessed if the most recent access is more recent than
-        // the most recent change in residence status.
-        match (
-            inner.last_accesses.recent(),
-            inner.last_residence_changes.recent(),
-        ) {
-            (None, _) => false,
-            (Some(_), None) => true,
-            (Some(a), Some(r)) => a.when >= r.timestamp,
-        }
-    }
 }

 /// Get a layer descriptor from a layer.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -33,14 +33,11 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
-use crate::tenant::disk_btree::{
-    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
-};
+use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadPlanner,
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
@@ -56,7 +53,6 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -227,11 +223,6 @@ pub struct DeltaLayerInner {
    file: VirtualFile,
    file_id: FileId,

-    #[allow(dead_code)]
-    layer_key_range: Range<Key>,
-    #[allow(dead_code)]
-    layer_lsn_range: Range<Lsn>,
-
    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

@@ -462,7 +453,7 @@ impl DeltaLayerWriterInner {
    ) -> (Vec<u8>, anyhow::Result<()>) {
        assert!(self.lsn_range.start <= lsn);
        // We don't want to use compression in delta layer creation
-        let compression = ImageCompressionAlgorithm::Disabled;
+        let compression = ImageCompressionAlgorithm::DisabledNoDecompress;
        let (val, res) = self
            .blob_writer
            .write_blob_maybe_compressed(val, ctx, compression)
@@ -751,14 +742,6 @@ impl DeltaLayer {
 }

 impl DeltaLayerInner {
-    pub(crate) fn key_range(&self) -> &Range<Key> {
-        &self.layer_key_range
-    }
-
-    pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
-        &self.layer_lsn_range
-    }
-
    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
    /// - inner has the success or transient failure
    /// - outer has the permanent failure
@@ -807,8 +790,6 @@ impl DeltaLayerInner {
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
            max_vectored_read_bytes,
-            layer_key_range: actual_summary.key_range,
-            layer_lsn_range: actual_summary.lsn_range,
        }))
    }

@@ -1182,7 +1163,9 @@ impl DeltaLayerInner {
                    let delta_key = DeltaKey::from_slice(key);
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
-                        layer: self,
+                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
+                            Adapter(self),
+                        )),
                    };
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
@@ -1321,7 +1304,7 @@ impl DeltaLayerInner {
                        offsets.start.pos(),
                        offsets.end.pos(),
                        meta,
-                        max_read_size,
+                        Some(max_read_size),
                    ))
                }
            } else {
@@ -1426,7 +1409,7 @@ impl DeltaLayerInner {
        let keys = self.load_keys(ctx).await?;

        async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
-            let buf = val.load_raw(ctx).await?;
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
            let val = Value::des(&buf)?;
            let desc = match val {
                Value::Image(img) => {
@@ -1461,7 +1444,8 @@ impl DeltaLayerInner {
            use pageserver_api::key::CHECKPOINT_KEY;
            use postgres_ffi::CheckPoint;
            if key == CHECKPOINT_KEY {
-                let val = val.load(ctx).await?;
+                let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+                let val = Value::des(&buf)?;
                match val {
                    Value::Image(img) => {
                        let checkpoint = CheckPoint::decode(&img)?;
@@ -1514,6 +1498,7 @@ impl DeltaLayerInner {
        offset
    }

+    #[cfg(any())]
    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader =
@@ -1524,7 +1509,7 @@ impl DeltaLayerInner {
            index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
            key_values_batch: std::collections::VecDeque::new(),
            is_end: false,
-            planner: StreamingVectoredReadPlanner::new(
+            planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
                1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
                1024,        // The default value. Unit tests might use a different value
            ),
@@ -1545,24 +1530,17 @@ pub struct DeltaEntry<'a> {
 /// Reference to an on-disk value
 pub struct ValueRef<'a> {
    blob_ref: BlobRef,
-    layer: &'a DeltaLayerInner,
+    reader: BlockCursor<'a>,
 }

 impl<'a> ValueRef<'a> {
    /// Loads the value from disk
    pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
-        let buf = self.load_raw(ctx).await?;
+        // theoretically we *could* record an access time for each, but it does not really matter
+        let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
        let val = Value::des(&buf)?;
        Ok(val)
    }
-
-    async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
-        let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
-            self.layer,
-        )));
-        let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
-        Ok(buf)
-    }
 }

 pub(crate) struct Adapter<T>(T);
@@ -1596,15 +1574,17 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
    }
 }

+#[cfg(test)]
 pub struct DeltaLayerIterator<'a> {
    delta_layer: &'a DeltaLayerInner,
    ctx: &'a RequestContext,
-    planner: StreamingVectoredReadPlanner,
-    index_iter: DiskBtreeIterator<'a>,
-    key_values_batch: VecDeque<(Key, Lsn, Value)>,
+    planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
+    index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
+    key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
    is_end: bool,
 }

+#[cfg(test)]
 impl<'a> DeltaLayerIterator<'a> {
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1618,17 +1598,13 @@ impl<'a> DeltaLayerIterator<'a> {
                let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
                let blob_ref = BlobRef(value);
                let offset = blob_ref.pos();
-                if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
+                if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
                    break batch_plan;
                }
            } else {
                self.is_end = true;
                let data_end_offset = self.delta_layer.index_start_offset();
-                if let Some(item) = self.planner.handle_range_end(data_end_offset) {
-                    break item;
-                } else {
-                    return Ok(()); // TODO: test empty iterator
-                }
+                break self.planner.handle_range_end(data_end_offset);
            }
        };
        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
@@ -1662,8 +1638,8 @@ impl<'a> DeltaLayerIterator<'a> {
    }
 }

-#[cfg(test)]
-pub(crate) mod test {
+#[cfg(any())]
+mod test {
    use std::collections::BTreeMap;

    use itertools::MinMaxResult;
@@ -1671,7 +1647,6 @@ pub(crate) mod test {
    use rand::RngCore;

    use super::*;
-    use crate::repository::Value;
    use crate::tenant::harness::TIMELINE_ID;
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
    use crate::tenant::Tenant;
@@ -1681,7 +1656,6 @@ pub(crate) mod test {
        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
        DEFAULT_PG_VERSION,
    };
-    use bytes::Bytes;

    /// Construct an index for a fictional delta layer and and then
    /// traverse in order to plan vectored reads for a query. Finally,
@@ -1934,7 +1908,7 @@ pub(crate) mod test {

    #[tokio::test]
    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
+        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
        let (tenant, ctx) = harness.load().await;

        let timeline_id = TimelineId::generate();
@@ -2034,9 +2008,7 @@ pub(crate) mod test {
        use crate::walrecord::NeonWalRecord;
        use bytes::Bytes;

-        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
-            .await
-            .unwrap();
+        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
        let (tenant, ctx) = h.load().await;
        let ctx = &ctx;
        let timeline = tenant
@@ -2245,31 +2217,15 @@ pub(crate) mod test {
        }
    }

-    pub(crate) fn sort_delta(
-        (k1, l1, _): &(Key, Lsn, Value),
-        (k2, l2, _): &(Key, Lsn, Value),
-    ) -> std::cmp::Ordering {
-        (k1, l1).cmp(&(k2, l2))
-    }
-
-    pub(crate) fn sort_delta_value(
-        (k1, l1, v1): &(Key, Lsn, Value),
-        (k2, l2, v2): &(Key, Lsn, Value),
-    ) -> std::cmp::Ordering {
-        let order_1 = if v1.is_image() { 0 } else { 1 };
-        let order_2 = if v2.is_image() { 0 } else { 1 };
-        (k1, l1, order_1).cmp(&(k2, l2, order_2))
-    }
-
-    pub(crate) async fn produce_delta_layer(
+    async fn produce_delta_layer(
        tenant: &Tenant,
        tline: &Arc<Timeline>,
        mut deltas: Vec<(Key, Lsn, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<ResidentLayer> {
-        deltas.sort_by(sort_delta);
+        deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
        let (key_start, _, _) = deltas.first().unwrap();
-        let (key_max, _, _) = deltas.last().unwrap();
+        let (key_max, _, _) = deltas.first().unwrap();
        let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
        let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
        let lsn_end = Lsn(lsn_max.0 + 1);
@@ -2314,7 +2270,10 @@ pub(crate) mod test {

    #[tokio::test]
    async fn delta_layer_iterator() {
-        let harness = TenantHarness::create("delta_layer_iterator").await.unwrap();
+        use crate::repository::Value;
+        use bytes::Bytes;
+
+        let harness = TenantHarness::create("delta_layer_iterator").unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -29,16 +29,13 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
-use crate::tenant::disk_btree::{
-    DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
-};
+use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
-    BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
-    VectoredReadPlanner,
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
@@ -53,7 +50,6 @@ use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::VecDeque;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -169,6 +165,7 @@ pub struct ImageLayerInner {
    file_id: FileId,

    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
+    compressed_reads: bool,
 }

 impl std::fmt::Debug for ImageLayerInner {
@@ -182,7 +179,8 @@ impl std::fmt::Debug for ImageLayerInner {

 impl ImageLayerInner {
    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let block_reader =
+            FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
        let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
@@ -270,9 +268,10 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
-            .await
-            .and_then(|res| res)?;
+        let loaded =
+            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx)
+                .await
+                .and_then(|res| res)?;

        // not production code
        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -373,14 +372,6 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
-    pub(crate) fn key_range(&self) -> &Range<Key> {
-        &self.key_range
-    }
-
-    pub(crate) fn lsn(&self) -> Lsn {
-        self.lsn
-    }
-
    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
    /// - inner has the success or transient failure
    /// - outer has the permanent failure
@@ -389,6 +380,7 @@ impl ImageLayerInner {
        lsn: Lsn,
        summary: Option<Summary>,
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
+        support_compressed_reads: bool,
        ctx: &RequestContext,
    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
        let file = match VirtualFile::open(path, ctx).await {
@@ -432,6 +424,7 @@ impl ImageLayerInner {
            file,
            file_id,
            max_vectored_read_bytes,
+            compressed_reads: support_compressed_reads,
            key_range: actual_summary.key_range,
        }))
    }
@@ -442,7 +435,8 @@ impl ImageLayerInner {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let block_reader =
+            FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);

@@ -502,12 +496,14 @@ impl ImageLayerInner {
        &self,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let block_reader =
+            FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
        let mut result = Vec::new();
        let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let block_reader =
+            FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
        let cursor = block_reader.block_cursor();
        while let Some(item) = stream.next().await {
            // TODO: dedup code with get_reconstruct_value
@@ -542,7 +538,8 @@ impl ImageLayerInner {
                .into(),
        );

-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let block_reader =
+            FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);

@@ -701,17 +698,19 @@ impl ImageLayerInner {
        }
    }

+    #[cfg(test)]
    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
-        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let block_reader =
+            FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
        ImageLayerIterator {
            image_layer: self,
            ctx,
            index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
-            key_values_batch: VecDeque::new(),
+            key_values_batch: std::collections::VecDeque::new(),
            is_end: false,
-            planner: StreamingVectoredReadPlanner::new(
+            planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
                1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
                1024,        // The default value. Unit tests might use a different value
            ),
@@ -738,9 +737,6 @@ struct ImageLayerWriterInner {
    key_range: Range<Key>,
    lsn: Lsn,

-    // Total uncompressed bytes passed into put_image
-    uncompressed_bytes: u64,
-
    blob_writer: BlobWriter<false>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 }
@@ -796,7 +792,6 @@ impl ImageLayerWriterInner {
            lsn,
            tree: tree_builder,
            blob_writer,
-            uncompressed_bytes: 0,
        };

        Ok(writer)
@@ -814,12 +809,7 @@ impl ImageLayerWriterInner {
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
-        let compression = self.conf.image_compression;
-        self.uncompressed_bytes += img.len() as u64;
-        let (_img, res) = self
-            .blob_writer
-            .write_blob_maybe_compressed(img, ctx, compression)
-            .await;
+        let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
        // TODO: re-use the buffer for `img` further upstack
        let off = res?;

@@ -841,11 +831,6 @@ impl ImageLayerWriterInner {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

-        // Calculate compression ratio
-        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
-        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
-
        let mut file = self.blob_writer.into_inner();

        // Write out the index
@@ -985,15 +970,17 @@ impl Drop for ImageLayerWriter {
    }
 }

+#[cfg(test)]
 pub struct ImageLayerIterator<'a> {
    image_layer: &'a ImageLayerInner,
    ctx: &'a RequestContext,
-    planner: StreamingVectoredReadPlanner,
-    index_iter: DiskBtreeIterator<'a>,
-    key_values_batch: VecDeque<(Key, Lsn, Value)>,
+    planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
+    index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
+    key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
    is_end: bool,
 }

+#[cfg(test)]
 impl<'a> ImageLayerIterator<'a> {
    /// Retrieve a batch of key-value pairs into the iterator buffer.
    async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1007,17 +994,14 @@ impl<'a> ImageLayerIterator<'a> {
                    Key::from_slice(&raw_key[..KEY_SIZE]),
                    self.image_layer.lsn,
                    offset,
+                    BlobFlag::None,
                ) {
                    break batch_plan;
                }
            } else {
                self.is_end = true;
                let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
-                if let Some(item) = self.planner.handle_range_end(payload_end) {
-                    break item;
-                } else {
-                    return Ok(()); // TODO: a test case on empty iterator
-                }
+                break self.planner.handle_range_end(payload_end);
            }
        };
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
@@ -1111,7 +1095,6 @@ mod test {
            ShardIdentity::unsharded(),
            get_next_gen(),
        )
-        .await
        .unwrap();
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
@@ -1178,7 +1161,6 @@ mod test {
                // But here, all we care about is that the gen number is unique.
                get_next_gen(),
            )
-            .await
            .unwrap();
            let (tenant, ctx) = harness.load().await;
            let timeline = tenant
@@ -1308,9 +1290,10 @@ mod test {
        }
    }

+    #[cfg(any())]
    #[tokio::test]
    async fn image_layer_iterator() {
-        let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
+        let harness = TenantHarness::create("image_layer_iterator").unwrap();
        let (tenant, ctx) = harness.load().await;

        let tline = tenant
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -521,30 +521,6 @@ impl InMemoryLayer {
        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
    }

-    pub(crate) async fn put_values(
-        &self,
-        mut values: Vec<(Lsn, Key, smallvec::SmallVec<[u8; 256]>, u64)>,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        for (_lsn, _key, buf, off) in &mut values {
-            *off = self.put_value_locked2(&mut inner, &buf, ctx).await?;
-        }
-
-        for (lsn, key, _buf, off) in values.into_iter() {
-            let vec_map = inner.index.entry(key).or_default();
-
-            // Use fast version of append, since we know our LSNs are already sorted
-            vec_map.append2(lsn, off);
-        }
-
-        let size = inner.file.len();
-        inner.resource_units.maybe_publish_size(size);
-
-        Ok(())
-    }
-
    async fn put_value_locked(
        &self,
        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
@@ -580,27 +556,6 @@ impl InMemoryLayer {
        Ok(())
    }

-    async fn put_value_locked2(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        buf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<u64> {
-        let off = {
-            locked_inner
-                .file
-                .write_blob(
-                    buf,
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::InMemoryLayer)
-                        .build(),
-                )
-                .await?
-        };
-
-        Ok(off)
-    }
-
    pub(crate) fn get_opened_at(&self) -> Instant {
        self.opened_at
    }
@@ -619,6 +574,8 @@ impl InMemoryLayer {
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;
+
        assert!(
            self.start_lsn < end_lsn,
            "{} >= {}",
@@ -636,13 +593,9 @@ impl InMemoryLayer {
            })
            .expect("frozen_local_path_str set only once");

-        #[cfg(debug_assertions)]
-        {
-            let inner = self.inner.write().await;
-            for vec_map in inner.index.values() {
-                for (lsn, _pos) in vec_map.as_slice() {
-                    debug_assert!(*lsn < end_lsn);
-                }
+        for vec_map in inner.index.values() {
+            for (lsn, _pos) in vec_map.as_slice() {
+                assert!(*lsn < end_lsn);
            }
        }
    }
@@ -762,22 +715,16 @@ impl InMemoryLayer {
                        res?;
                    }
                }
+
+                // Hold the permit until the IO is done; if we didn't, one could drop this future,
+                // thereby releasing the permit, but the Vec<u8> remains allocated until the IO completes.
+                // => we'd have more concurrenct Vec<u8> than allowed as per the semaphore.
+                drop(_concurrency_permit);
            }
        }

        // MAX is used here because we identify L0 layers by full key range
        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
-
-        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
-        //
-        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
-        // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
-        // Thus, we'd have more concurrenct `Vec<u8>` in existence than the semaphore allows.
-        //
-        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
-        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
-        drop(_concurrency_permit);
-
        Ok(Some(delta_layer))
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -385,7 +385,6 @@ impl Layer {
    }

    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
-    #[allow(dead_code)]
    pub(crate) async fn load_key_values(
        &self,
        ctx: &RequestContext,
@@ -694,18 +693,6 @@ impl Drop for LayerInner {
            // and we could be delaying shutdown for nothing.
        }

-        if let Some(timeline) = self.timeline.upgrade() {
-            // Only need to decrement metrics if the timeline still exists: otherwise
-            // it will have already de-registered these metrics via TimelineMetrics::shutdown
-            if self.desc.is_delta() {
-                timeline.metrics.layer_count_delta.dec();
-                timeline.metrics.layer_size_delta.sub(self.desc.file_size);
-            } else {
-                timeline.metrics.layer_count_image.dec();
-                timeline.metrics.layer_size_image.sub(self.desc.file_size);
-            }
-        }
-
        if !*self.wanted_deleted.get_mut() {
            return;
        }
@@ -804,15 +791,6 @@ impl LayerInner {
            (heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
        };

-        // This object acts as a RAII guard on these metrics: increment on construction
-        if desc.is_delta() {
-            timeline.metrics.layer_count_delta.inc();
-            timeline.metrics.layer_size_delta.add(desc.file_size);
-        } else {
-            timeline.metrics.layer_count_image.inc();
-            timeline.metrics.layer_size_image.add(desc.file_size);
-        }
-
        LayerInner {
            conf,
            debug_str: {
@@ -1491,22 +1469,14 @@ impl LayerInner {
                let duration = SystemTime::now().duration_since(local_layer_mtime);
                match duration {
                    Ok(elapsed) => {
-                        let accessed = self.access_stats.accessed();
-                        if accessed {
-                            // Only layers used for reads contribute to our "low residence" metric that is used
-                            // to detect thrashing.  Layers promoted for other reasons (e.g. compaction) are allowed
-                            // to be rapidly evicted without contributing to this metric.
-                            timeline
-                                .metrics
-                                .evictions_with_low_residence_duration
-                                .read()
-                                .unwrap()
-                                .observe(elapsed);
-                        }
-
+                        timeline
+                            .metrics
+                            .evictions_with_low_residence_duration
+                            .read()
+                            .unwrap()
+                            .observe(elapsed);
                        tracing::info!(
                            residence_millis = elapsed.as_millis(),
-                            accessed,
                            "evicted layer after known residence period"
                        );
                    }
@@ -1715,6 +1685,7 @@ impl DownloadedLayer {
                    lsn,
                    summary,
                    Some(owner.conf.max_vectored_read_bytes),
+                    owner.conf.image_compression.allow_decompression(),
                    ctx,
                )
                .await
@@ -1919,7 +1890,7 @@ impl ResidentLayer {
        self.owner.metadata()
    }

-    /// Cast the layer to a delta, return an error if it is an image layer.
+    #[cfg(any())]
    pub(crate) async fn get_as_delta(
        &self,
        ctx: &RequestContext,
@@ -1931,7 +1902,7 @@ impl ResidentLayer {
        }
    }

-    /// Cast the layer to an image, return an error if it is a delta layer.
+    #[cfg(test)]
    pub(crate) async fn get_as_image(
        &self,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
 async fn smoke_test() {
    let handle = tokio::runtime::Handle::current();

-    let h = TenantHarness::create("smoke_test").await.unwrap();
+    let h = TenantHarness::create("smoke_test").unwrap();
    let span = h.span();
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
    let (tenant, _) = h.load().await;
@@ -176,9 +176,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    // this is the runtime on which Layer spawns the blocking tasks on
    let handle = tokio::runtime::Handle::current();

-    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted")
-        .await
-        .unwrap();
+    let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
    let (tenant, ctx) = h.load().await;

@@ -260,9 +258,7 @@ fn read_wins_pending_eviction() {
    rt.block_on(async move {
        // this is the runtime on which Layer spawns the blocking tasks on
        let handle = tokio::runtime::Handle::current();
-        let h = TenantHarness::create("read_wins_pending_eviction")
-            .await
-            .unwrap();
+        let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
        let (tenant, ctx) = h.load().await;
        let span = h.span();
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -394,7 +390,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
    rt.block_on(async move {
        // this is the runtime on which Layer spawns the blocking tasks on
        let handle = tokio::runtime::Handle::current();
-        let h = TenantHarness::create(name).await.unwrap();
+        let h = TenantHarness::create(name).unwrap();
        let (tenant, ctx) = h.load().await;
        let span = h.span();
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -563,9 +559,8 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
 #[tokio::test(start_paused = true)]
 async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let handle = tokio::runtime::Handle::current();
-    let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
-        .await
-        .unwrap();
+    let h =
+        TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
@@ -641,9 +636,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
 #[tokio::test(start_paused = true)]
 async fn evict_and_wait_does_not_wait_for_download() {
    // let handle = tokio::runtime::Handle::current();
-    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
-        .await
-        .unwrap();
+    let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
    let (tenant, ctx) = h.load().await;
    let span = h.span();
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -740,9 +733,7 @@ async fn eviction_cancellation_on_drop() {
    // this is the runtime on which Layer spawns the blocking tasks on
    let handle = tokio::runtime::Handle::current();

-    let h = TenantHarness::create("eviction_cancellation_on_drop")
-        .await
-        .unwrap();
+    let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
    let (tenant, ctx) = h.load().await;

--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -25,7 +25,7 @@ pub struct PersistentLayerDesc {
    ///
    /// - For an open in-memory layer, the end bound is MAX_LSN
    /// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
-    ///   range start
+    /// range start
    /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
    pub lsn_range: Range<Lsn>,
    /// Whether this is a delta layer, and also, is this incremental.
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -1,561 +0,0 @@
-use std::{
-    cmp::Ordering,
-    collections::{binary_heap, BinaryHeap},
-};
-
-use pageserver_api::key::Key;
-use utils::lsn::Lsn;
-
-use crate::{context::RequestContext, repository::Value};
-
-use super::{
-    delta_layer::{DeltaLayerInner, DeltaLayerIterator},
-    image_layer::{ImageLayerInner, ImageLayerIterator},
-};
-
-#[derive(Clone, Copy)]
-enum LayerRef<'a> {
-    Image(&'a ImageLayerInner),
-    Delta(&'a DeltaLayerInner),
-}
-
-impl<'a> LayerRef<'a> {
-    fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
-        match self {
-            Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
-            Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
-        }
-    }
-}
-
-enum LayerIterRef<'a> {
-    Image(ImageLayerIterator<'a>),
-    Delta(DeltaLayerIterator<'a>),
-}
-
-impl LayerIterRef<'_> {
-    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        match self {
-            Self::Delta(x) => x.next().await,
-            Self::Image(x) => x.next().await,
-        }
-    }
-}
-
-/// This type plays several roles at once
-/// 1. Unified iterator for image and delta layers.
-/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
-/// 3. Lazy creation of the real delta/image iterator.
-enum IteratorWrapper<'a> {
-    NotLoaded {
-        ctx: &'a RequestContext,
-        first_key_lower_bound: (Key, Lsn),
-        layer: LayerRef<'a>,
-    },
-    Loaded {
-        iter: PeekableLayerIterRef<'a>,
-    },
-}
-
-struct PeekableLayerIterRef<'a> {
-    iter: LayerIterRef<'a>,
-    peeked: Option<(Key, Lsn, Value)>, // None == end
-}
-
-impl<'a> PeekableLayerIterRef<'a> {
-    async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> {
-        let peeked = iter.next().await?;
-        Ok(Self { iter, peeked })
-    }
-
-    fn peek(&self) -> &Option<(Key, Lsn, Value)> {
-        &self.peeked
-    }
-
-    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        let result = self.peeked.take();
-        self.peeked = self.iter.next().await?;
-        Ok(result)
-    }
-}
-
-impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
-    fn eq(&self, other: &Self) -> bool {
-        self.cmp(other) == Ordering::Equal
-    }
-}
-
-impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}
-
-impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        use std::cmp::Ordering;
-        let a = self.peek_next_key_lsn_value();
-        let b = other.peek_next_key_lsn_value();
-        match (a, b) {
-            (Some((k1, l1, v1)), Some((k2, l2, v2))) => {
-                fn map_value_to_num(val: &Option<&Value>) -> usize {
-                    match val {
-                        None => 0,
-                        Some(Value::Image(_)) => 1,
-                        Some(Value::WalRecord(_)) => 2,
-                    }
-                }
-                let order_1 = map_value_to_num(&v1);
-                let order_2 = map_value_to_num(&v2);
-                // When key_lsn are the same, the unloaded iter will always appear before the loaded one.
-                // And note that we do a reverse at the end of the comparison, so it works with the max heap.
-                (k1, l1, order_1).cmp(&(k2, l2, order_2))
-            }
-            (Some(_), None) => Ordering::Less,
-            (None, Some(_)) => Ordering::Greater,
-            (None, None) => Ordering::Equal,
-        }
-        .reverse()
-    }
-}
-
-impl<'a> IteratorWrapper<'a> {
-    pub fn create_from_image_layer(
-        image_layer: &'a ImageLayerInner,
-        ctx: &'a RequestContext,
-    ) -> Self {
-        Self::NotLoaded {
-            layer: LayerRef::Image(image_layer),
-            first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
-            ctx,
-        }
-    }
-
-    pub fn create_from_delta_layer(
-        delta_layer: &'a DeltaLayerInner,
-        ctx: &'a RequestContext,
-    ) -> Self {
-        Self::NotLoaded {
-            layer: LayerRef::Delta(delta_layer),
-            first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
-            ctx,
-        }
-    }
-
-    fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
-        match self {
-            Self::Loaded { iter } => iter
-                .peek()
-                .as_ref()
-                .map(|(key, lsn, val)| (key, *lsn, Some(val))),
-            Self::NotLoaded {
-                first_key_lower_bound: (key, lsn),
-                ..
-            } => Some((key, *lsn, None)),
-        }
-    }
-
-    // CORRECTNESS: this function must always take `&mut self`, never `&self`.
-    //
-    // The reason is that `impl Ord for Self` evaluates differently after this function
-    // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when
-    // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut`
-    // and not just `PeekMut::deref`
-    // If we don't take `&mut self`
-    async fn load(&mut self) -> anyhow::Result<()> {
-        assert!(!self.is_loaded());
-        let Self::NotLoaded {
-            ctx,
-            first_key_lower_bound,
-            layer,
-        } = self
-        else {
-            unreachable!()
-        };
-        let iter = layer.iter(ctx);
-        let iter = PeekableLayerIterRef::create(iter).await?;
-        if let Some((k1, l1, _)) = iter.peek() {
-            let (k2, l2) = first_key_lower_bound;
-            debug_assert!((k1, l1) >= (k2, l2));
-        }
-        *self = Self::Loaded { iter };
-        Ok(())
-    }
-
-    fn is_loaded(&self) -> bool {
-        matches!(self, Self::Loaded { .. })
-    }
-
-    /// Correctness: must load the iterator before using.
-    ///
-    /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it.
-    /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
-    /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
-    async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        let Self::Loaded { iter } = self else {
-            panic!("must load the iterator before using")
-        };
-        iter.next().await
-    }
-}
-
-/// A merge iterator over delta/image layer iterators. When duplicated records are
-/// found, the iterator will not perform any deduplication, and the caller should handle
-/// these situation. By saying duplicated records, there are many possibilities:
-/// * Two same delta at the same LSN.
-/// * Two same image at the same LSN.
-/// * Delta/image at the same LSN where the image has already applied the delta.
-/// The iterator will always put the image before the delta.
-pub struct MergeIterator<'a> {
-    heap: BinaryHeap<IteratorWrapper<'a>>,
-}
-
-impl<'a> MergeIterator<'a> {
-    pub fn create(
-        deltas: &[&'a DeltaLayerInner],
-        images: &[&'a ImageLayerInner],
-        ctx: &'a RequestContext,
-    ) -> Self {
-        let mut heap = Vec::with_capacity(images.len() + deltas.len());
-        for image in images {
-            heap.push(IteratorWrapper::create_from_image_layer(image, ctx));
-        }
-        for delta in deltas {
-            heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx));
-        }
-        Self {
-            heap: BinaryHeap::from(heap),
-        }
-    }
-
-    pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-        while let Some(mut iter) = self.heap.peek_mut() {
-            if !iter.is_loaded() {
-                // Once we load the iterator, we can know the real first key-value pair in the iterator.
-                // We put it back into the heap so that a potentially unloaded layer may have a key between
-                // [potential_first_key, loaded_first_key).
-                iter.load().await?;
-                continue;
-            }
-            let Some(item) = iter.next().await? else {
-                // If the iterator returns None, we pop this iterator. Actually, in the current implementation,
-                // we order None > Some, and all the rest of the iterators should return None.
-                binary_heap::PeekMut::pop(iter);
-                continue;
-            };
-            return Ok(Some(item));
-        }
-        Ok(None)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use itertools::Itertools;
-    use pageserver_api::key::Key;
-    use utils::lsn::Lsn;
-
-    use crate::{
-        tenant::{
-            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
-        },
-        walrecord::NeonWalRecord,
-        DEFAULT_PG_VERSION,
-    };
-
-    async fn assert_merge_iter_equal(
-        merge_iter: &mut MergeIterator<'_>,
-        expect: &[(Key, Lsn, Value)],
-    ) {
-        let mut expect_iter = expect.iter();
-        loop {
-            let o1 = merge_iter.next().await.unwrap();
-            let o2 = expect_iter.next();
-            assert_eq!(o1.is_some(), o2.is_some());
-            if o1.is_none() && o2.is_none() {
-                break;
-            }
-            let (k1, l1, v1) = o1.unwrap();
-            let (k2, l2, v2) = o2.unwrap();
-            assert_eq!(&k1, k2);
-            assert_eq!(l1, *l2);
-            assert_eq!(&v1, v2);
-        }
-    }
-
-    #[tokio::test]
-    async fn merge_in_between() {
-        use crate::repository::Value;
-        use bytes::Bytes;
-
-        let harness = TenantHarness::create("merge_iterator_merge_in_between")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-        let test_deltas1 = vec![
-            (
-                get_key(0),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-            (
-                get_key(5),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-        ];
-        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
-            .await
-            .unwrap();
-        let test_deltas2 = vec![
-            (
-                get_key(3),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-            (
-                get_key(4),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-        ];
-        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
-            .await
-            .unwrap();
-        let mut merge_iter = MergeIterator::create(
-            &[
-                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
-            ],
-            &[],
-            &ctx,
-        );
-        let mut expect = Vec::new();
-        expect.extend(test_deltas1);
-        expect.extend(test_deltas2);
-        expect.sort_by(sort_delta);
-        assert_merge_iter_equal(&mut merge_iter, &expect).await;
-    }
-
-    #[tokio::test]
-    async fn delta_merge() {
-        use crate::repository::Value;
-        use bytes::Bytes;
-
-        let harness = TenantHarness::create("merge_iterator_delta_merge")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-        const N: usize = 1000;
-        let test_deltas1 = (0..N)
-            .map(|idx| {
-                (
-                    get_key(idx as u32 / 10),
-                    Lsn(0x20 * ((idx as u64) % 10 + 1)),
-                    Value::Image(Bytes::from(format!("img{idx:05}"))),
-                )
-            })
-            .collect_vec();
-        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
-            .await
-            .unwrap();
-        let test_deltas2 = (0..N)
-            .map(|idx| {
-                (
-                    get_key(idx as u32 / 10),
-                    Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10),
-                    Value::Image(Bytes::from(format!("img{idx:05}"))),
-                )
-            })
-            .collect_vec();
-        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
-            .await
-            .unwrap();
-        let test_deltas3 = (0..N)
-            .map(|idx| {
-                (
-                    get_key(idx as u32 / 10 + N as u32),
-                    Lsn(0x10 * ((idx as u64) % 10 + 1)),
-                    Value::Image(Bytes::from(format!("img{idx:05}"))),
-                )
-            })
-            .collect_vec();
-        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
-            .await
-            .unwrap();
-        let mut merge_iter = MergeIterator::create(
-            &[
-                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
-            ],
-            &[],
-            &ctx,
-        );
-        let mut expect = Vec::new();
-        expect.extend(test_deltas1);
-        expect.extend(test_deltas2);
-        expect.extend(test_deltas3);
-        expect.sort_by(sort_delta);
-        assert_merge_iter_equal(&mut merge_iter, &expect).await;
-
-        // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
-    }
-
-    #[tokio::test]
-    async fn delta_image_mixed_merge() {
-        use crate::repository::Value;
-        use bytes::Bytes;
-
-        let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
-            .await
-            .unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-        // In this test case, we want to test if the iterator still works correctly with multiple copies
-        // of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
-        // Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
-        // An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
-        // could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
-        // one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
-        // correctly process these situations and return everything as-is, and the upper layer of the system
-        // will handle duplicated LSNs.
-        let test_deltas1 = vec![
-            (
-                get_key(0),
-                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init()),
-            ),
-            (
-                get_key(0),
-                Lsn(0x18),
-                Value::WalRecord(NeonWalRecord::wal_append("a")),
-            ),
-            (
-                get_key(5),
-                Lsn(0x10),
-                Value::WalRecord(NeonWalRecord::wal_init()),
-            ),
-            (
-                get_key(5),
-                Lsn(0x18),
-                Value::WalRecord(NeonWalRecord::wal_append("b")),
-            ),
-        ];
-        let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
-            .await
-            .unwrap();
-        let mut test_deltas2 = test_deltas1.clone();
-        test_deltas2.push((
-            get_key(10),
-            Lsn(0x20),
-            Value::Image(Bytes::copy_from_slice(b"test")),
-        ));
-        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
-            .await
-            .unwrap();
-        let test_deltas3 = vec![
-            (
-                get_key(0),
-                Lsn(0x10),
-                Value::Image(Bytes::copy_from_slice(b"")),
-            ),
-            (
-                get_key(5),
-                Lsn(0x18),
-                Value::Image(Bytes::copy_from_slice(b"b")),
-            ),
-            (
-                get_key(15),
-                Lsn(0x20),
-                Value::Image(Bytes::copy_from_slice(b"test")),
-            ),
-        ];
-        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
-            .await
-            .unwrap();
-        let mut test_deltas4 = test_deltas3.clone();
-        test_deltas4.push((
-            get_key(20),
-            Lsn(0x20),
-            Value::Image(Bytes::copy_from_slice(b"test")),
-        ));
-        let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
-            .await
-            .unwrap();
-        let mut expect = Vec::new();
-        expect.extend(test_deltas1);
-        expect.extend(test_deltas2);
-        expect.extend(test_deltas3);
-        expect.extend(test_deltas4);
-        expect.sort_by(sort_delta_value);
-
-        // Test with different layer order for MergeIterator::create to ensure the order
-        // is stable.
-
-        let mut merge_iter = MergeIterator::create(
-            &[
-                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
-            ],
-            &[],
-            &ctx,
-        );
-        assert_merge_iter_equal(&mut merge_iter, &expect).await;
-
-        let mut merge_iter = MergeIterator::create(
-            &[
-                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_3.get_as_delta(&ctx).await.unwrap(),
-                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
-            ],
-            &[],
-            &ctx,
-        );
-        assert_merge_iter_equal(&mut merge_iter, &expect).await;
-
-        is_send(merge_iter);
-    }
-
-    fn is_send(_: impl Send) {}
-}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -66,13 +66,12 @@ use std::{
    ops::{Deref, Range},
 };

+use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
-        config::defaults::DEFAULT_PITR_INTERVAL,
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
-        storage_layer::PersistentLayerDesc,
    },
 };
 use crate::{
@@ -99,7 +98,6 @@ use crate::{
    metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
-use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
 use crate::{
    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
    virtual_file::{MaybeFatalIo, VirtualFile},
@@ -198,7 +196,7 @@ impl PartialOrd for Hole {

 /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
 /// Can be removed after all refactors are done.
-fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
+fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
    drop(rlock)
 }

@@ -271,7 +269,7 @@ pub struct Timeline {
    ///
    /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
    /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
-    pub(crate) layers: tokio::sync::RwLock<LayerManager>,
+    pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,

    last_freeze_at: AtomicLsn,
    // Atomic would be more appropriate here.
@@ -478,32 +476,37 @@ impl GcInfo {
    }
 }

-/// The `GcInfo` component describing which Lsns need to be retained.  Functionally, this
-/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
-/// between time-based and space-based retention for observability and consumption metrics purposes.
+/// The `GcInfo` component describing which Lsns need to be retained.
 #[derive(Debug)]
 pub(crate) struct GcCutoffs {
-    /// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
-    /// history we must keep to retain a specified number of bytes of WAL.
-    pub(crate) space: Lsn,
+    /// Keep everything newer than this point.
+    ///
+    /// This is calculated by subtracting 'gc_horizon' setting from
+    /// last-record LSN
+    ///
+    /// FIXME: is this inclusive or exclusive?
+    pub(crate) horizon: Lsn,

-    /// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
-    /// history we must keep to enable reading back at least the PITR interval duration.
-    pub(crate) time: Lsn,
+    /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
+    /// point.
+    ///
+    /// This is calculated by finding a number such that a record is needed for PITR
+    /// if only if its LSN is larger than 'pitr_cutoff'.
+    pub(crate) pitr: Lsn,
 }

 impl Default for GcCutoffs {
    fn default() -> Self {
        Self {
-            space: Lsn::INVALID,
-            time: Lsn::INVALID,
+            horizon: Lsn::INVALID,
+            pitr: Lsn::INVALID,
        }
    }
 }

 impl GcCutoffs {
    fn select_min(&self) -> Lsn {
-        std::cmp::min(self.space, self.time)
+        std::cmp::min(self.horizon, self.pitr)
    }
 }

@@ -725,9 +728,6 @@ impl From<CreateImageLayersError> for CompactionError {
    fn from(e: CreateImageLayersError) -> Self {
        match e {
            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
-            CreateImageLayersError::Other(e) => {
-                CompactionError::Other(e.context("create image layers"))
-            }
            _ => CompactionError::Other(e.into()),
        }
    }
@@ -862,7 +862,7 @@ impl Timeline {
        let gc_info = self.gc_info.read().unwrap();
        let history = self
            .get_last_record_lsn()
-            .checked_sub(gc_info.cutoffs.time)
+            .checked_sub(gc_info.cutoffs.pitr)
            .unwrap_or(Lsn(0))
            .0;
        (history, gc_info.within_ancestor_pitr)
@@ -1561,7 +1561,7 @@ impl Timeline {
    ) -> anyhow::Result<()> {
        ensure!(
            lsn >= **latest_gc_cutoff_lsn,
-            "LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
+            "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
            lsn,
            **latest_gc_cutoff_lsn,
        );
@@ -3404,8 +3404,6 @@ impl Timeline {
        }
    }

-    #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
-    #[allow(clippy::doc_lazy_continuation)]
    /// Get the data needed to reconstruct all keys in the provided keyspace
    ///
    /// The algorithm is as follows:
@@ -4472,10 +4470,10 @@ impl Timeline {
    /// are required. Since checking if new image layers are required is expensive in
    /// terms of CPU, we only do it in the following cases:
    /// 1. If the timeline has ingested sufficient WAL to justify the cost
-    /// 2. If enough time has passed since the last check:
-    ///     1. For large tenants, we wish to perform the check more often since they
-    ///        suffer from the lack of image layers
-    ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
+    /// 2. If enough time has passed since the last check
+    /// 2.1. For large tenants, we wish to perform the check more often since they
+    /// suffer from the lack of image layers
+    /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval
    fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;

@@ -4571,22 +4569,6 @@ impl Timeline {
                    start = img_range.end;
                    continue;
                }
-            } else if let ImageLayerCreationMode::Force = mode {
-                // When forced to create image layers, we might try and create them where they already
-                // exist.  This mode is only used in tests/debug.
-                let layers = self.layers.read().await;
-                if layers.contains_key(&PersistentLayerKey {
-                    key_range: img_range.clone(),
-                    lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
-                    is_delta: false,
-                }) {
-                    tracing::info!(
-                        "Skipping image layer at {lsn} {}..{}, already exists",
-                        img_range.start,
-                        img_range.end
-                    );
-                    continue;
-                }
            }

            let image_layer_writer = ImageLayerWriter::new(
@@ -4717,7 +4699,7 @@ impl Timeline {
    /// Requires a timeline that:
    /// - has an ancestor to detach from
    /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
-    ///   a technical requirement
+    /// a technical requirement
    ///
    /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
    /// polled again until completion.
@@ -4729,7 +4711,13 @@ impl Timeline {
        tenant: &crate::tenant::Tenant,
        options: detach_ancestor::Options,
        ctx: &RequestContext,
-    ) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
+    ) -> Result<
+        (
+            completion::Completion,
+            detach_ancestor::PreparedTimelineDetach,
+        ),
+        detach_ancestor::Error,
+    > {
        detach_ancestor::prepare(self, tenant, options, ctx).await
    }

@@ -4936,21 +4924,24 @@ impl Timeline {
    }

    /// Find the Lsns above which layer files need to be retained on
-    /// garbage collection.
+    /// garbage collection. This is separate from actually performing the GC,
+    /// and is updated more frequently, so that compaction can remove obsolete
+    /// page versions more aggressively.
    ///
-    /// We calculate two cutoffs, one based on time and one based on WAL size.  `pitr`
-    /// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
-    /// the space-based retention.
+    /// TODO: that's wishful thinking, compaction doesn't actually do that
+    /// currently.
    ///
-    /// This function doesn't simply to calculate time & space based retention: it treats time-based
-    /// retention as authoritative if enabled, and falls back to space-based retention if calculating
-    /// the LSN for a time point isn't possible.  Therefore the GcCutoffs::horizon in the response might
-    /// be different to the `space_cutoff` input.  Callers should treat the min() of the two cutoffs
-    /// in the response as the GC cutoff point for the timeline.
+    /// The 'cutoff_horizon' point is used to retain recent versions that might still be
+    /// needed by read-only nodes. (As of this writing, the caller just passes
+    /// the latest LSN subtracted by a constant, and doesn't do anything smart
+    /// to figure out what read-only nodes might actually need.)
+    ///
+    /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
+    /// whether a record is needed for PITR.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
    pub(super) async fn find_gc_cutoffs(
        &self,
-        space_cutoff: Lsn,
+        cutoff_horizon: Lsn,
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
@@ -4963,87 +4954,58 @@ impl Timeline {

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

-        if cfg!(test) {
-            // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
-            if pitr == Duration::ZERO {
-                return Ok(GcCutoffs {
-                    time: self.get_last_record_lsn(),
-                    space: space_cutoff,
-                });
-            }
-        }
-
-        // Calculate a time-based limit on how much to retain:
-        // - if PITR interval is set, then this is our cutoff.
-        // - if PITR interval is not set, then we do a lookup
-        //   based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
-        let time_cutoff = {
+        // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
+        //
+        // Some unit tests depend on garbage-collection working even when
+        // CLOG data is missing, so that find_lsn_for_timestamp() doesn't
+        // work, so avoid calling it altogether if time-based retention is not
+        // configured. It would be pointless anyway.
+        let pitr_cutoff = if pitr != Duration::ZERO {
            let now = SystemTime::now();
-            let time_range = if pitr == Duration::ZERO {
-                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
+            if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
+                let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
+
+                match self
+                    .find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
+                    .await?
+                {
+                    LsnForTimestamp::Present(lsn) => lsn,
+                    LsnForTimestamp::Future(lsn) => {
+                        // The timestamp is in the future. That sounds impossible,
+                        // but what it really means is that there hasn't been
+                        // any commits since the cutoff timestamp.
+                        //
+                        // In this case we should use the LSN of the most recent commit,
+                        // which is implicitly the last LSN in the log.
+                        debug!("future({})", lsn);
+                        self.get_last_record_lsn()
+                    }
+                    LsnForTimestamp::Past(lsn) => {
+                        debug!("past({})", lsn);
+                        // conservative, safe default is to remove nothing, when we
+                        // have no commit timestamp data available
+                        *self.get_latest_gc_cutoff_lsn()
+                    }
+                    LsnForTimestamp::NoData(lsn) => {
+                        debug!("nodata({})", lsn);
+                        // conservative, safe default is to remove nothing, when we
+                        // have no commit timestamp data available
+                        *self.get_latest_gc_cutoff_lsn()
+                    }
+                }
            } else {
-                pitr
-            };
-
-            // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
-            let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
-            let timestamp = to_pg_timestamp(time_cutoff);
-
-            match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
-                LsnForTimestamp::Present(lsn) => Some(lsn),
-                LsnForTimestamp::Future(lsn) => {
-                    // The timestamp is in the future. That sounds impossible,
-                    // but what it really means is that there hasn't been
-                    // any commits since the cutoff timestamp.
-                    //
-                    // In this case we should use the LSN of the most recent commit,
-                    // which is implicitly the last LSN in the log.
-                    debug!("future({})", lsn);
-                    Some(self.get_last_record_lsn())
-                }
-                LsnForTimestamp::Past(lsn) => {
-                    debug!("past({})", lsn);
-                    None
-                }
-                LsnForTimestamp::NoData(lsn) => {
-                    debug!("nodata({})", lsn);
-                    None
-                }
+                // If we don't have enough data to convert to LSN,
+                // play safe and don't remove any layers.
+                *self.get_latest_gc_cutoff_lsn()
            }
+        } else {
+            // No time-based retention was configured. Interpret this as "keep no history".
+            self.get_last_record_lsn()
        };

-        Ok(match (pitr, time_cutoff) {
-            (Duration::ZERO, Some(time_cutoff)) => {
-                // PITR is not set. Retain the size-based limit, or the default time retention,
-                // whichever requires less data.
-                GcCutoffs {
-                    time: self.get_last_record_lsn(),
-                    space: std::cmp::max(time_cutoff, space_cutoff),
-                }
-            }
-            (Duration::ZERO, None) => {
-                // PITR is not set, and time lookup failed
-                GcCutoffs {
-                    time: self.get_last_record_lsn(),
-                    space: space_cutoff,
-                }
-            }
-            (_, None) => {
-                // PITR interval is set & we didn't look up a timestamp successfully.  Conservatively assume PITR
-                // cannot advance beyond what was already GC'd, and respect space-based retention
-                GcCutoffs {
-                    time: *self.get_latest_gc_cutoff_lsn(),
-                    space: space_cutoff,
-                }
-            }
-            (_, Some(time_cutoff)) => {
-                // PITR interval is set and we looked up timestamp successfully.  Ignore
-                // size based retention and make time cutoff authoritative
-                GcCutoffs {
-                    time: time_cutoff,
-                    space: time_cutoff,
-                }
-            }
+        Ok(GcCutoffs {
+            horizon: cutoff_horizon,
+            pitr: pitr_cutoff,
        })
    }

@@ -5068,11 +5030,11 @@ impl Timeline {
            return Err(GcError::TimelineCancelled);
        }

-        let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
+        let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
            let gc_info = self.gc_info.read().unwrap();

-            let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
-            let time_cutoff = gc_info.cutoffs.time;
+            let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
+            let pitr_cutoff = gc_info.cutoffs.pitr;
            let retain_lsns = gc_info.retain_lsns.clone();

            // Gets the maximum LSN that holds the valid lease.
@@ -5082,14 +5044,14 @@ impl Timeline {
            let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);

            (
-                space_cutoff,
-                time_cutoff,
+                horizon_cutoff,
+                pitr_cutoff,
                retain_lsns,
                max_lsn_with_valid_lease,
            )
        };

-        let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
+        let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
        let standby_horizon = self.standby_horizon.load();
        // Hold GC for the standby, but as a safety guard do it only within some
        // reasonable lag.
@@ -5118,8 +5080,8 @@ impl Timeline {

        let res = self
            .gc_timeline(
-                space_cutoff,
-                time_cutoff,
+                horizon_cutoff,
+                pitr_cutoff,
                retain_lsns,
                max_lsn_with_valid_lease,
                new_gc_cutoff,
@@ -5137,8 +5099,8 @@ impl Timeline {

    async fn gc_timeline(
        &self,
-        space_cutoff: Lsn,
-        time_cutoff: Lsn,
+        horizon_cutoff: Lsn,
+        pitr_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
        max_lsn_with_valid_lease: Option<Lsn>,
        new_gc_cutoff: Lsn,
@@ -5199,22 +5161,22 @@ impl Timeline {
            result.layers_total += 1;

            // 1. Is it newer than GC horizon cutoff point?
-            if l.get_lsn_range().end > space_cutoff {
+            if l.get_lsn_range().end > horizon_cutoff {
                debug!(
-                    "keeping {} because it's newer than space_cutoff {}",
+                    "keeping {} because it's newer than horizon_cutoff {}",
                    l.layer_name(),
-                    space_cutoff,
+                    horizon_cutoff,
                );
                result.layers_needed_by_cutoff += 1;
                continue 'outer;
            }

            // 2. It is newer than PiTR cutoff point?
-            if l.get_lsn_range().end > time_cutoff {
+            if l.get_lsn_range().end > pitr_cutoff {
                debug!(
-                    "keeping {} because it's newer than time_cutoff {}",
+                    "keeping {} because it's newer than pitr_cutoff {}",
                    l.layer_name(),
-                    time_cutoff,
+                    pitr_cutoff,
                );
                result.layers_needed_by_pitr += 1;
                continue 'outer;
@@ -5990,45 +5952,10 @@ impl<'a> TimelineWriter<'a> {
        batch: VecMap<Lsn, (Key, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        if batch.is_empty() {
-            return Ok(());
+        for (lsn, (key, val)) in batch {
+            self.put(key, lsn, &val, ctx).await?
        }

-        let first_lsn = batch.as_slice().first().unwrap().0;
-        let last_lsn = batch.as_slice().last().unwrap().0;
-        let mut total_serialized_size = 0;
-
-        let mut serialized = Vec::with_capacity(batch.len());
-        for (l, (k, v)) in batch.into_iter() {
-            // Avoid doing allocations for "small" values.
-            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
-            // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
-            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
-            v.ser_into(&mut buf)
-                .expect("Serialization of Value is infallible");
-            let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
-            total_serialized_size += buf_size;
-            serialized.push((l, k, buf, 0));
-        }
-
-        let action = self.get_open_layer_action(first_lsn, total_serialized_size);
-        let layer = self
-            .handle_open_layer_action(first_lsn, action, ctx)
-            .await?;
-
-        layer.put_values(serialized, ctx).await?;
-
-        // Update the current size only when the entire write was ok.
-        // In case of failures, we may have had partial writes which
-        // render the size tracking out of sync. That's ok because
-        // the checkpoint distance should be significantly smaller
-        // than the S3 single shot upload limit of 5GiB.
-        let state = self.write_guard.as_mut().unwrap();
-
-        state.current_size += total_serialized_size;
-        state.prev_lsn = Some(last_lsn);
-        state.max_lsn = std::cmp::max(state.max_lsn, Some(last_lsn));
-
        Ok(())
    }

@@ -6081,9 +6008,8 @@ mod tests {

    #[tokio::test]
    async fn two_layer_eviction_attempts_at_the_same_time() {
-        let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
-            .await
-            .unwrap();
+        let harness =
+            TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();

        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -26,11 +26,9 @@ use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
-use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
-use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
+use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
+use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -197,7 +195,7 @@ impl Timeline {
        tracing::info!(
            "latest_gc_cutoff: {}, pitr cutoff {}",
            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.time
+            self.gc_info.read().unwrap().cutoffs.pitr
        );

        let layers = self.layers.read().await;
@@ -381,7 +379,7 @@ impl Timeline {
            };

            let begin = tokio::time::Instant::now();
-            let phase1_layers_locked = self.layers.read().await;
+            let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
            let now = tokio::time::Instant::now();
            stats.read_lock_acquisition_micros =
                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -401,9 +399,9 @@ impl Timeline {
    }

    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
-    async fn compact_level0_phase1<'a>(
-        self: &'a Arc<Self>,
-        guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
+    async fn compact_level0_phase1(
+        self: &Arc<Self>,
+        guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
        ctx: &RequestContext,
@@ -417,7 +415,6 @@ impl Timeline {
            .map(|x| guard.get_from_desc(&x))
            .collect_vec();
        stats.level0_deltas_count = Some(level0_deltas.len());
-
        // Only compact if enough layers have accumulated.
        let threshold = self.get_compaction_threshold();
        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
@@ -448,22 +445,6 @@ impl Timeline {
        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
        let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());

-        // Accumulate the size of layers in `deltas_to_compact`
-        let mut deltas_to_compact_bytes = 0;
-
-        // Under normal circumstances, we will accumulate up to compaction_interval L0s of size
-        // checkpoint_distance each.  To avoid edge cases using extra system resources, bound our
-        // work in this function to only operate on this much delta data at once.
-        //
-        // Take the max of the configured value & the default, so that tests that configure tiny values
-        // can still use a sensible amount of memory, but if a deployed system configures bigger values we
-        // still let them compact a full stack of L0s in one go.
-        let delta_size_limit = std::cmp::max(
-            self.get_compaction_threshold(),
-            DEFAULT_COMPACTION_THRESHOLD,
-        ) as u64
-            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
-
        deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
        for l in level0_deltas_iter {
            let lsn_range = &l.layer_desc().lsn_range;
@@ -472,20 +453,7 @@ impl Timeline {
                break;
            }
            deltas_to_compact.push(l.download_and_keep_resident().await?);
-            deltas_to_compact_bytes += l.metadata().file_size;
            prev_lsn_end = lsn_range.end;
-
-            if deltas_to_compact_bytes >= delta_size_limit {
-                info!(
-                    l0_deltas_selected = deltas_to_compact.len(),
-                    l0_deltas_total = level0_deltas.len(),
-                    "L0 compaction picker hit max delta layer size limit: {}",
-                    delta_size_limit
-                );
-
-                // Proceed with compaction, but only a subset of L0s
-                break;
-            }
        }
        let lsn_range = Range {
            start: deltas_to_compact
@@ -1022,7 +990,7 @@ impl Timeline {
                    "enhanced legacy compaction currently does not support retain_lsns (branches)"
                )));
            }
-            let gc_cutoff = gc_info.cutoffs.select_min();
+            let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
            let mut selected_layers = Vec::new();
            // TODO: consider retain_lsns
            drop(gc_info);
@@ -1040,12 +1008,10 @@ impl Timeline {
        );
        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
        // Also, collect the layer information to decide when to split the new delta layers.
-        let mut downloaded_layers = Vec::new();
+        let mut all_key_values = Vec::new();
        let mut delta_split_points = BTreeSet::new();
        for layer in &layer_selection {
-            let resident_layer = layer.download_and_keep_resident().await?;
-            downloaded_layers.push(resident_layer);
-
+            all_key_values.extend(layer.load_key_values(ctx).await?);
            let desc = layer.layer_desc();
            if desc.is_delta() {
                // TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
@@ -1055,28 +1021,44 @@ impl Timeline {
                delta_split_points.insert(key_range.end);
            }
        }
-        let mut delta_layers = Vec::new();
-        let mut image_layers = Vec::new();
-        for resident_layer in &downloaded_layers {
-            if resident_layer.layer_desc().is_delta() {
-                let layer = resident_layer.get_as_delta(ctx).await?;
-                delta_layers.push(layer);
-            } else {
-                let layer = resident_layer.get_as_image(ctx).await?;
-                image_layers.push(layer);
+        // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
+        // image layers, make image appear before than delta.
+        struct ValueWrapper<'a>(&'a crate::repository::Value);
+        impl Ord for ValueWrapper<'_> {
+            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+                use crate::repository::Value;
+                use std::cmp::Ordering;
+                match (self.0, other.0) {
+                    (Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
+                    (Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
+                    _ => Ordering::Equal,
+                }
            }
        }
-        let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
+        impl PartialOrd for ValueWrapper<'_> {
+            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+                Some(self.cmp(other))
+            }
+        }
+        impl PartialEq for ValueWrapper<'_> {
+            fn eq(&self, other: &Self) -> bool {
+                self.cmp(other) == std::cmp::Ordering::Equal
+            }
+        }
+        impl Eq for ValueWrapper<'_> {}
+        all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
+            (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
+        });
        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
        // Data of the same key.
        let mut accumulated_values = Vec::new();
-        let mut last_key: Option<Key> = None;
+        let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty

        /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
        async fn flush_accumulated_states(
            tline: &Arc<Timeline>,
            key: Key,
-            accumulated_values: &[(Key, Lsn, crate::repository::Value)],
+            accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
            horizon: Lsn,
        ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
            let mut base_image = None;
@@ -1177,7 +1159,7 @@ impl Timeline {
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
-            &(Key::MIN..Key::MAX), // covers the full key range
+            &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
            gc_cutoff,
            ctx,
        )
@@ -1187,24 +1169,20 @@ impl Timeline {
        let delta_split_points = delta_split_points.into_iter().collect_vec();
        let mut current_delta_split_point = 0;
        let mut delta_layers = Vec::new();
-        while let Some((key, lsn, val)) = merge_iter.next().await? {
-            if last_key.is_none() || last_key.as_ref() == Some(&key) {
-                if last_key.is_none() {
-                    last_key = Some(key);
-                }
-                accumulated_values.push((key, lsn, val));
+        for item @ (key, _, _) in &all_key_values {
+            if &last_key == key {
+                accumulated_values.push(item);
            } else {
-                let last_key = last_key.as_mut().unwrap();
                let (deltas, image) =
-                    flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff)
+                    flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
                        .await?;
                // Put the image into the image layer. Currently we have a single big layer for the compaction.
-                image_layer_writer.put_image(*last_key, image, ctx).await?;
+                image_layer_writer.put_image(last_key, image, ctx).await?;
                delta_values.extend(deltas);
                delta_layers.extend(
                    flush_deltas(
                        &mut delta_values,
-                        *last_key,
+                        last_key,
                        &delta_split_points,
                        &mut current_delta_split_point,
                        self,
@@ -1214,12 +1192,11 @@ impl Timeline {
                    .await?,
                );
                accumulated_values.clear();
-                *last_key = key;
-                accumulated_values.push((key, lsn, val));
+                accumulated_values.push(item);
+                last_key = *key;
            }
        }

-        let last_key = last_key.expect("no keys produced during compaction");
        // TODO: move this part to the loop body
        let (deltas, image) =
            flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -182,15 +182,13 @@ async fn remove_timeline_from_tenant(
 /// 5. Delete index part
 /// 6. Delete meta, timeline directory
 /// 7. Delete mark file
-///
 /// It is resumable from any step in case a crash/restart occurs.
 /// There are three entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
-///    and we possibly neeed to continue deletion of remote files.
+/// and we possibly neeed to continue deletion of remote files.
 /// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-///    index but still have local metadata, timeline directory and delete mark.
-///
+/// index but still have local metadata, timeline directory and delete mark.
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
 pub enum DeleteTimelineFlow {
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -10,7 +10,6 @@ use crate::{
    },
    virtual_file::{MaybeFatalIo, VirtualFile},
 };
-use pageserver_api::models::detach_ancestor::AncestorDetached;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
 use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
@@ -40,9 +39,6 @@ pub(crate) enum Error {

    #[error("unexpected error")]
    Unexpected(#[source] anyhow::Error),
-
-    #[error("failpoint: {}", .0)]
-    Failpoint(&'static str),
 }

 impl From<Error> for ApiError {
@@ -61,41 +57,11 @@ impl From<Error> for ApiError {
            | e @ Error::CopyDeltaPrefix(_)
            | e @ Error::UploadRewritten(_)
            | e @ Error::CopyFailed(_)
-            | e @ Error::Unexpected(_)
-            | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
+            | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
        }
    }
 }

-impl From<crate::tenant::upload_queue::NotInitialized> for Error {
-    fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self {
-        // treat all as shutting down signals, even though that is not entirely correct
-        // (uninitialized state)
-        Error::ShuttingDown
-    }
-}
-
-impl From<FlushLayerError> for Error {
-    fn from(value: FlushLayerError) -> Self {
-        match value {
-            FlushLayerError::Cancelled => Error::ShuttingDown,
-            FlushLayerError::NotRunning(_) => {
-                // FIXME(#6424): technically statically unreachable right now, given how we never
-                // drop the sender
-                Error::ShuttingDown
-            }
-            FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => {
-                Error::FlushAncestor(value)
-            }
-        }
-    }
-}
-
-pub(crate) enum Progress {
-    Prepared(completion::Completion, PreparedTimelineDetach),
-    Done(AncestorDetached),
-}
-
 pub(crate) struct PreparedTimelineDetach {
    layers: Vec<Layer>,
 }
@@ -122,7 +88,7 @@ pub(super) async fn prepare(
    tenant: &Tenant,
    options: Options,
    ctx: &RequestContext,
-) -> Result<Progress, Error> {
+) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
    use Error::*;

    let Some((ancestor, ancestor_lsn)) = detached
@@ -130,67 +96,15 @@ pub(super) async fn prepare(
        .as_ref()
        .map(|tl| (tl.clone(), detached.ancestor_lsn))
    else {
-        {
-            let accessor = detached.remote_client.initialized_upload_queue()?;
-
-            // we are safe to inspect the latest uploaded, because we can only witness this after
-            // restart is complete and ancestor is no more.
-            let latest = accessor.latest_uploaded_index_part();
-            if !latest.lineage.is_detached_from_original_ancestor() {
-                return Err(NoAncestor);
-            }
-        }
-
-        // detached has previously been detached; let's inspect each of the current timelines and
-        // report back the timelines which have been reparented by our detach
-        let mut all_direct_children = tenant
-            .timelines
-            .lock()
-            .unwrap()
-            .values()
-            .filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
-            .map(|tl| (tl.ancestor_lsn, tl.clone()))
-            .collect::<Vec<_>>();
-
-        let mut any_shutdown = false;
-
-        all_direct_children.retain(
-            |(_, tl)| match tl.remote_client.initialized_upload_queue() {
-                Ok(accessor) => accessor
-                    .latest_uploaded_index_part()
-                    .lineage
-                    .is_reparented(),
-                Err(_shutdownalike) => {
-                    // not 100% a shutdown, but let's bail early not to give inconsistent results in
-                    // sharded enviroment.
-                    any_shutdown = true;
-                    true
-                }
-            },
-        );
-
-        if any_shutdown {
-            // it could be one or many being deleted; have client retry
-            return Err(Error::ShuttingDown);
-        }
-
-        let mut reparented = all_direct_children;
-        // why this instead of hashset? there is a reason, but I've forgotten it many times.
+        // TODO: check if we have already been detached; for this we need to read the stored data
+        // on remote client, for that we need a follow-up which makes uploads cheaper and maintains
+        // a projection of the commited data.
        //
-        // maybe if this was a hashset we would not be able to distinguish some race condition.
-        reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
-
-        return Ok(Progress::Done(AncestorDetached {
-            reparented_timelines: reparented
-                .into_iter()
-                .map(|(_, tl)| tl.timeline_id)
-                .collect(),
-        }));
+        // the error is wrong per openapi
+        return Err(NoAncestor);
    };

    if !ancestor_lsn.is_valid() {
-        // rare case, probably wouldn't even load
-        tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
        return Err(NoAncestor);
    }

@@ -217,15 +131,6 @@ pub(super) async fn prepare(

    let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;

-    utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
-
-    fail::fail_point!(
-        "timeline-detach-ancestor::before_starting_after_locking",
-        |_| Err(Error::Failpoint(
-            "timeline-detach-ancestor::before_starting_after_locking"
-        ))
-    );
-
    if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
        let span =
            tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
@@ -246,7 +151,7 @@ pub(super) async fn prepare(
                }
            };

-            res?;
+            res.map_err(FlushAncestor)?;

            // we do not need to wait for uploads to complete but we do need `struct Layer`,
            // copying delta prefix is unsupported currently for `InMemoryLayer`.
@@ -254,7 +159,7 @@ pub(super) async fn prepare(
                elapsed_ms = started_at.elapsed().as_millis(),
                "froze and flushed the ancestor"
            );
-            Ok::<_, Error>(())
+            Ok(())
        }
        .instrument(span)
        .await?;
@@ -378,7 +283,7 @@ pub(super) async fn prepare(

    let prepared = PreparedTimelineDetach { layers: new_layers };

-    Ok(Progress::Prepared(guard, prepared))
+    Ok((guard, prepared))
 }

 fn partition_work(
@@ -445,11 +350,7 @@ async fn copy_lsn_prefix(
    target_timeline: &Arc<Timeline>,
    ctx: &RequestContext,
 ) -> Result<Option<ResidentLayer>, Error> {
-    use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown};
-
-    if target_timeline.cancel.is_cancelled() {
-        return Err(ShuttingDown);
-    }
+    use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};

    tracing::debug!(%layer, %end_lsn, "copying lsn prefix");

@@ -628,7 +529,7 @@ pub(super) async fn complete(
        match res {
            Ok(Some(timeline)) => {
                tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
-                reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
+                reparented.push(timeline.timeline_id);
            }
            Ok(None) => {
                // lets just ignore this for now. one or all reparented timelines could had
@@ -650,12 +551,5 @@ pub(super) async fn complete(
        tracing::info!("failed to reparent some candidates");
    }

-    reparented.sort_unstable();
-
-    let reparented = reparented
-        .into_iter()
-        .map(|(_, timeline_id)| timeline_id)
-        .collect();
-
    Ok(reparented)
 }
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -339,10 +339,6 @@ impl LayerManager {
        self.layer_fmgr.contains(layer)
    }

-    pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.layer_fmgr.contains_key(key)
-    }
-
    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
        self.layer_fmgr.0.keys().cloned().collect_vec()
    }
@@ -367,10 +363,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
            .clone()
    }

-    fn contains_key(&self, key: &PersistentLayerKey) -> bool {
-        self.0.contains_key(key)
-    }
-
    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
 /// Calculation consists of two stages:
 ///
 /// 1. Initial size calculation. That might take a long time, because it requires
-///    reading all layers containing relation sizes at `initial_part_end`.
+/// reading all layers containing relation sizes at `initial_part_end`.
 ///
 /// 2. Collecting an incremental part and adding that to the initial size.
-///    Increments are appended on walreceiver writing new timeline data,
-///    which result in increase or decrease of the logical size.
+/// Increments are appended on walreceiver writing new timeline data,
+/// which result in increase or decrease of the logical size.
 pub(super) struct LogicalSize {
    /// Size, potentially slow to compute. Calculating this might require reading multiple
    /// layers, and even ancestor's layers.
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize {
    /// Size shouldn't ever be negative, but this is signed for two reasons:
    ///
    /// 1. If we initialized the "baseline" size lazily, while we already
-    ///    process incoming WAL, the incoming WAL records could decrement the
-    ///    variable and temporarily make it negative. (This is just future-proofing;
-    ///    the initialization is currently not done lazily.)
+    /// process incoming WAL, the incoming WAL records could decrement the
+    /// variable and temporarily make it negative. (This is just future-proofing;
+    /// the initialization is currently not done lazily.)
    ///
    /// 2. If there is a bug and we e.g. forget to increment it in some cases
-    ///    when size grows, but remember to decrement it when it shrinks again, the
-    ///    variable could go negative. In that case, it seems better to at least
-    ///    try to keep tracking it, rather than clamp or overflow it. Note that
-    ///    get_current_logical_size() will clamp the returned value to zero if it's
-    ///    negative, and log an error. Could set it permanently to zero or some
-    ///    special value to indicate "broken" instead, but this will do for now.
+    /// when size grows, but remember to decrement it when it shrinks again, the
+    /// variable could go negative. In that case, it seems better to at least
+    /// try to keep tracking it, rather than clamp or overflow it. Note that
+    /// get_current_logical_size() will clamp the returned value to zero if it's
+    /// negative, and log an error. Could set it permanently to zero or some
+    /// special value to indicate "broken" instead, but this will do for now.
    ///
    /// Note that we also expose a copy of this value as a prometheus metric,
    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -2,13 +2,13 @@
 //! To do so, a current implementation needs to do the following:
 //!
 //! * acknowledge the timelines that it needs to stream WAL into.
-//!   Pageserver is able to dynamically (un)load tenants on attach and detach,
-//!   hence WAL receiver needs to react on such events.
+//! Pageserver is able to dynamically (un)load tenants on attach and detach,
+//! hence WAL receiver needs to react on such events.
 //!
 //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
-//!   For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
-//!   The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
-//!   Without this data, no WAL streaming is possible currently.
+//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
+//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
+//! Without this data, no WAL streaming is possible currently.
 //!
 //! Only one active WAL streaming connection is allowed at a time.
 //! The connection is supposed to be updated periodically, based on safekeeper timeline data.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Arpad Müller	92acb7cd5f	More commenting out	2024-07-10 03:27:33 +02:00
Arpad Müller	c0ae2c4232	more commenting out	2024-07-09 18:18:53 +02:00
Arpad Müller	c927675da9	Comment out some tests	2024-07-09 18:11:58 +02:00
Arpad Müller	f7c24c2834	Make vectored read_blobs function not fill buffer correctly	2024-07-09 15:40:25 +02:00