Compare commits


1 commit

167 changed files with 2661 additions and 6991 deletions


@@ -9,8 +9,8 @@ inputs:
description: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 16'
default: '16'
description: 'Postgres version; default is 15'
default: '15'
api_host:
description: 'Neon API host'
default: console-stage.neon.build


@@ -115,7 +115,6 @@ runs:
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
export DEFAULT_PG_VERSION=${PG_VERSION#v}
export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1


@@ -56,26 +56,15 @@ concurrency:
jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
strategy:
matrix:
include:
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
provisioner: 'k8s-pod'
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
provisioner: 'k8s-neonvm'
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }}
PLATFORM: "neon-staging"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -96,10 +85,9 @@ jobs:
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ matrix.region_id }}
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
provisioner: ${{ matrix.provisioner }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
@@ -108,18 +96,10 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
pg_version: ${{ env.DEFAULT_PG_VERSION }}
# Set --sparse-ordering option of pytest-order plugin
# to ensure tests are running in order of appears in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
extra_params:
-m remote_cluster
--sparse-ordering
--timeout 14400
--ignore test_runner/performance/test_perf_olap.py
--ignore test_runner/performance/test_perf_pgvector_queries.py
--ignore test_runner/performance/test_logical_replication.py
--ignore test_runner/performance/test_physical_replication.py
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -145,71 +125,6 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
replication-tests:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_logical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_physical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
@@ -217,14 +132,11 @@ jobs:
# Available platforms:
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
runs-on: ubuntu-22.04
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -235,33 +147,23 @@ jobs:
- name: Generate matrix for pgbench benchmark
id: pgbench-compare-matrix
run: |
region_id_default=${{ env.DEFAULT_REGION_ID }}
matrix='{
"pg_version" : [
16
],
"region_id" : [
"'"$region_id_default"'"
],
"platform": [
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]')
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -313,7 +215,7 @@ jobs:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -338,14 +240,14 @@ jobs:
prefix: latest
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ matrix.region_id }}
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
@@ -358,7 +260,7 @@ jobs:
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -383,7 +285,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -397,7 +298,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -411,7 +311,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -438,12 +337,6 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
strategy:
matrix:
include:
- PLATFORM: "neon-captest-pgvector"
- PLATFORM: "azure-captest-pgvector"
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -452,7 +345,7 @@ jobs:
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }}
PLATFORM: "neon-captest-pgvector"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -472,18 +365,7 @@ jobs:
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
;;
azure-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
exit 1
;;
esac
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
@@ -495,7 +377,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -509,7 +390,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -524,7 +404,7 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -772,7 +652,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"


@@ -1336,7 +1336,6 @@ jobs:
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
# Update compatibility snapshot for the release
for pg_version in v14 v15 v16; do
@@ -1350,7 +1349,7 @@ jobs:
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)

Cargo.lock generated

@@ -1236,7 +1236,6 @@ dependencies = [
"regex",
"remote_storage",
"reqwest 0.12.4",
"rlimit",
"rust-ini",
"serde",
"serde_json",
@@ -1398,9 +1397,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.8"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
dependencies = [
"rustc_version",
]
@@ -1652,16 +1651,6 @@ dependencies = [
"rusticata-macros",
]
[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
"serde",
]
[[package]]
name = "desim"
version = "0.1.0"
@@ -2028,6 +2017,16 @@ dependencies = [
"tokio-util",
]
[[package]]
name = "fs2"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "fsevent-sys"
version = "4.1.0"
@@ -3009,9 +3008,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measured"
version = "0.0.22"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
dependencies = [
"bytes",
"crossbeam-utils",
@@ -3027,9 +3026,9 @@ dependencies = [
[[package]]
name = "measured-derive"
version = "0.0.22"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
dependencies = [
"heck 0.5.0",
"proc-macro2",
@@ -3039,9 +3038,9 @@ dependencies = [
[[package]]
name = "measured-process"
version = "0.0.22"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
dependencies = [
"libc",
"measured",
@@ -3276,12 +3275,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -3674,7 +3667,6 @@ dependencies = [
"sysinfo",
"tenant_size_model",
"thiserror",
"tikv-jemallocator",
"tokio",
"tokio-epoll-uring",
"tokio-io-timeout",
@@ -4085,7 +4077,6 @@ dependencies = [
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-util",
"tracing",
"workspace_hack",
]
@@ -4126,12 +4117,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -4404,7 +4389,6 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"typed-json",
"url",
"urlencoding",
"utils",
@@ -4893,15 +4877,6 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "rlimit"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
dependencies = [
"libc",
]
[[package]]
name = "routerify"
version = "3.0.0"
@@ -5170,6 +5145,7 @@ dependencies = [
"crc32c",
"desim",
"fail",
"fs2",
"futures",
"git-version",
"hex",
@@ -5196,8 +5172,6 @@ dependencies = [
"sha2",
"signal-hook",
"storage_broker",
"strum",
"strum_macros",
"thiserror",
"tokio",
"tokio-io-timeout",
@@ -5422,9 +5396,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
[[package]]
name = "serde"
version = "1.0.203"
version = "1.0.183"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
dependencies = [
"serde_derive",
]
@@ -5441,9 +5415,9 @@ dependencies = [
[[package]]
name = "serde_derive"
version = "1.0.203"
version = "1.0.183"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
dependencies = [
"proc-macro2",
"quote",
@@ -6133,15 +6107,12 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.36"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
dependencies = [
"deranged",
"itoa",
"js-sys",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
@@ -6149,17 +6120,16 @@ dependencies = [
[[package]]
name = "time-core"
version = "0.1.2"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
[[package]]
name = "time-macros"
version = "0.2.18"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
dependencies = [
"num-conv",
"time-core",
]
@@ -6502,6 +6472,17 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
[[package]]
name = "trace"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"pageserver_api",
"utils",
"workspace_hack",
]
[[package]]
name = "tracing"
version = "0.1.37"
@@ -6666,16 +6647,6 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "typed-json"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -7456,12 +7427,13 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"deranged",
"either",
"fail",
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.14.5",
@@ -7479,9 +7451,7 @@ dependencies = [
"num-traits",
"once_cell",
"parquet",
"proc-macro2",
"prost",
"quote",
"rand 0.8.5",
"regex",
"regex-automata 0.4.3",
@@ -7498,7 +7468,6 @@ dependencies = [
"syn 1.0.109",
"syn 2.0.52",
"sync_wrapper",
"tikv-jemalloc-sys",
"time",
"time-macros",
"tokio",


@@ -15,6 +15,7 @@ members = [
"storage_controller",
"storage_scrubber",
"workspace_hack",
"trace",
"libs/compute_api",
"libs/pageserver_api",
"libs/postgres_ffi",
@@ -83,6 +84,7 @@ enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
@@ -109,8 +111,8 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
@@ -184,7 +186,6 @@ tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }


@@ -311,12 +311,9 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
FROM build-deps AS rum-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/rum.patch /rum.patch
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control


@@ -44,4 +44,3 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.13"
bytes = "1.0"
rust-ini = "0.20.0"
rlimit = "0.10.1"


@@ -6,7 +6,7 @@
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - If remote_extension_config is provided, it will be used to fetch extensions list
//! and download `shared_preload_libraries` from the remote storage.
//! and download `shared_preload_libraries` from the remote storage.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -33,6 +33,7 @@
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
//!
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
@@ -63,7 +64,6 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{setrlimit, Resource};
// this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var
@@ -72,9 +72,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();


@@ -56,7 +56,6 @@ pub struct ComputeNode {
/// - we push new spec and it does reconfiguration
/// - but then something happens and compute pod / VM is destroyed,
/// so k8s controller starts it again with the **old** spec
///
/// and the same for empty computes:
/// - we started compute without any spec
/// - we push spec and it does configuration
@@ -799,11 +798,7 @@ impl ComputeNode {
// In this case we need to connect with old `zenith_admin` name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let mut connstr = self.connstr.clone();
connstr
.query_pairs_mut()
.append_pair("application_name", "apply_config");
let connstr = self.connstr.clone();
let mut client = match Client::connect(connstr.as_str(), NoTls) {
Err(e) => match e.code() {
Some(&SqlState::INVALID_PASSWORD)
@@ -872,11 +867,6 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut connstr = connstr.clone();
connstr
.query_pairs_mut()
.append_pair("application_name", "migrations");
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations")
});
@@ -1117,7 +1107,7 @@ impl ComputeNode {
// EKS worker nodes have following core dump settings:
// /proc/sys/kernel/core_pattern -> core
// /proc/sys/kernel/core_uses_pid -> 1
// ulimit -c -> unlimited
// ulimint -c -> unlimited
// which results in core dumps being written to postgres data directory as core.<pid>.
//
// Use that as a default location and pattern, except macos where core dumps are written
@@ -1396,9 +1386,7 @@ pub fn forward_termination_signal() {
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
// Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
// ROs to get a list of running xacts faster instead of going through the CLOG.
// See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
kill(pg_pid, Signal::SIGINT).ok();
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
kill(pg_pid, Signal::SIGQUIT).ok();
}
}


@@ -11,7 +11,6 @@ pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
mod migration;
pub mod monitor;
pub mod params;
pub mod pg_helpers;


@@ -1,105 +0,0 @@
use anyhow::{Context, Result};
use postgres::Client;
use tracing::info;
pub(crate) struct MigrationRunner<'m> {
client: &'m mut Client,
migrations: &'m [&'m str],
}
impl<'m> MigrationRunner<'m> {
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
// The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
assert!(migrations.len() + 1 < i64::MAX as usize);
Self { client, migrations }
}
fn get_migration_id(&mut self) -> Result<i64> {
let query = "SELECT id FROM neon_migration.migration_id";
let row = self
.client
.query_one(query, &[])
.context("run_migrations get migration_id")?;
Ok(row.get::<&str, i64>("id"))
}
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
self.client
.simple_query(&setval)
.context("run_migrations update id")?;
Ok(())
}
fn prepare_migrations(&mut self) -> Result<()> {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
self.client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
self.client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
self.client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
self.client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
self.client.simple_query(query)?;
Ok(())
}
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_migrations()?;
let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}
let migration = self.migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id!(current_migration));
} else {
info!(
"Running migration id={}:\n{}\n",
migration_id!(current_migration),
migration
);
self.client
.simple_query("BEGIN")
.context("begin migration")?;
self.client.simple_query(migration).with_context(|| {
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;
// Migration IDs start at 1
self.update_migration_id(migration_id!(current_migration))?;
self.client
.simple_query("COMMIT")
.context("commit migration")?;
info!("Finished migration id={}", migration_id!(current_migration));
}
current_migration += 1;
}
Ok(())
}
}


@@ -1,7 +0,0 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
END IF;
END $$;


@@ -10,7 +10,6 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
use crate::config;
use crate::logger::inlinify;
use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
@@ -777,25 +776,84 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
// Add new migrations in numerical order.
let migrations = [
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0002-alter_roles.sql"),
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0001-alter_roles.sql"),
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!(
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
),
include_str!(
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
),
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
include_str!(
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
),
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
];
MigrationRunner::new(client, &migrations).run_migrations()?;
let mut func = || {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
let query = "SELECT id FROM neon_migration.migration_id";
let row = client
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
client
.simple_query(query)
.context("handle_migrations begin")?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
);
client.simple_query(migration).with_context(|| {
format!("handle_migrations current_migration={}", current_migration)
})?;
}
current_migration += 1;
}
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client
.simple_query(&setval)
.context("handle_migrations update id")?;
let query = "COMMIT";
client
.simple_query(query)
.context("handle_migrations commit")?;
info!(
"Ran {} migrations",
(migrations.len() - starting_migration_id)
);
Ok(())
}


@@ -15,6 +15,7 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -349,6 +350,11 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
@@ -449,6 +455,11 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
@@ -555,39 +566,60 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let (client, conn) = self.page_server_psql_client().await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
let client = std::pin::pin!(client);
// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
let base_tarfile =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
let wal_reader =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};
// Import base
self.http_client
.import_basebackup(
tenant_id,
timeline_id,
start_lsn,
end_lsn,
pg_version,
base_tarfile,
)
.await?;
let copy_in = |reader, cmd| {
let client = &client;
async move {
let writer = client.copy_in(&cmd).await?;
let writer = std::pin::pin!(writer);
let mut writer = writer.sink_map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
});
let mut reader = std::pin::pin!(reader);
writer.send_all(&mut reader).await?;
writer.into_inner().finish().await?;
anyhow::Ok(())
}
};
// Import base
copy_in(
base_tarfile,
format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
),
)
.await?;
// Import wal if necessary
if let Some(wal_reader) = wal_reader {
self.http_client
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
.await?;
copy_in(
wal_reader,
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
)
.await?;
}
Ok(())


@@ -56,10 +56,6 @@ enum Command {
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
NodeDelete {
#[arg(long)]
node_id: NodeId,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
@@ -341,7 +337,7 @@ async fn main() -> anyhow::Result<()> {
}
Command::TenantCreate { tenant_id } => {
storcon_client
.dispatch::<_, ()>(
.dispatch(
Method::POST,
"v1/tenant".to_string(),
Some(TenantCreateRequest {
@@ -361,16 +357,13 @@ async fn main() -> anyhow::Result<()> {
tracing::info!("Delete status: {}", status);
}
Command::Nodes {} => {
let mut resp = storcon_client
let resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
for node in resp {
@@ -402,16 +395,13 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
Command::Tenants {} => {
let mut resp = storcon_client
let resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
@@ -660,11 +650,6 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
.await?;
}
Command::NodeDelete { node_id } => {
storcon_client
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
.await?;
}
Command::TenantSetTimeBasedEviction {
tenant_id,
period,


@@ -1,252 +0,0 @@
# Ancestor Timeline Deletion
Created on: 2024-02-23
Author: John Spray
# Summary
When a tenant creates a new timeline that they will treat as their 'main' history,
it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
this is necessary because it is forbidden to delete a timeline which has descendants.
A new pageserver API is proposed to 'adopt' data from a parent timeline into
one of its children, such that the link between ancestor and child can be severed,
leaving the parent in a state where it may then be deleted.
# Motivation
Retaining parent timelines currently has two costs:
- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
child's timeline point, even if the child fully covers its keyspace with image
layers and will never actually read from the parent.
# Solution
A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
wish to detach from its parent.
On success, this API will leave the following state:
- The detached child timeline will no longer have an ancestor, and will contain all
the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
the detached child timeline will be modified to have the child timeline as their
new parent.
- The parent timeline will still exist, but the child will no longer have it as an
ancestor. If this was the last timeline that depended on the parent, then the
parent will become deletable.
This API's implementation will consist of a series of retryable steps, such that
on failures/timeout it can safely be called again to reach the target state.
## Example
### Before
The user has "rolled back" their project to LSN X, resulting in a "new main"
timeline. The parent "old main" timeline still exists, and they would like
to clean it up.
They have two other timelines A and B. A is from before the rollback point,
and B is from after the rollback point.
```
----"old main" timeline-------X-------------------------------------------->
| | |
|-> child A | |
|-> "new main" timeline |
-> child B
```
### After calling detach ancestor API
The "new main" timeline is no longer dependent on old main, and neither
is child A, because it had a branch point before X.
The user may now choose to delete child B and "old main" to get to
a pristine state. Child B is likely to be unwanted since the user
chose to roll back to X, and it branches from after X. However, we
don't assume this in the API; it is up to the user to delete it.
```
|----"old main" timeline---------------------------------------------------->
|
|
|
-> child B
|----"new main" timeline--------->
|
|-> child A
```
### After removing timelines
We end up with a totally clean state that leaves no trace that a rollback
ever happened: there is only one root timeline.
```
| ----"new main" timeline----------->
|
|-> child A
```
## Caveats
Important things for API users to bear in mind:
- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
child, the parent won't be deletable: you must either delete or detach those
children.
- do _not_ simply loop over all children and detach them all: this can have an
extremely high storage cost. The detach ancestor API is intended for use on a single
timeline to make it the new "main".
- The detach ancestor API should also not be
exposed directly to the user as button/API, because they might decide
to click it for all the children and thereby generate many copies of the
parent's data -- the detach ancestor API should be used as part
of a high level "clean up after rollback" feature.
## `detach_ancestor` API implementation
Terms used in the following sections:
- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point" the ancestor_lsn of "the child"
### Phase 1: write out adopted layers to S3
The child will "adopt" layers from the parent, such that its end state contains
all the parent's history as well as its own.
For all layers in the parent's layer map whose high LSN is below the branch
point, issue S3 CopyObject requests to duplicate them into the child timeline's
prefix. Do not add them to the child's layer map yet.
For delta layers in the parent's layer map which straddle the branch point, read them
and write out only content up to the branch point into new layer objects.
This is a long running operation if the parent has many layers: it should be
implemented in a way that resumes rather than restarting from scratch, if the API
times out and is called again.
As an optimization, if there are no other timelines that will be adopted into
the child, _and_ the child's image layers already fully cover the branch LSN,
then we may skip adopting layers.
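The rule above amounts to a per-layer classification against the branch point. Below is a minimal sketch of that decision, using hypothetical types; the real pageserver layer map structures differ.
```rust
// Illustrative only: which adoption action Phase 1 takes for a parent layer,
// based on its LSN range relative to the branch point. Types are hypothetical.
struct LayerDesc {
    lsn_start: u64,
    lsn_end: u64,
    is_delta: bool,
}

enum AdoptAction {
    /// Entirely below the branch point: duplicate into the child's prefix
    /// with an S3 CopyObject request.
    CopyObject,
    /// Delta layer straddling the branch point: rewrite only the content up
    /// to the branch point into a new layer object.
    TruncateAndRewrite,
    /// Above the branch point: not needed by the child.
    Skip,
}

fn classify(layer: &LayerDesc, branch_point: u64) -> AdoptAction {
    if layer.lsn_end <= branch_point {
        AdoptAction::CopyObject
    } else if layer.is_delta && layer.lsn_start < branch_point {
        AdoptAction::TruncateAndRewrite
    } else {
        AdoptAction::Skip
    }
}
```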
### Phase 2: update the child's index
Having written out all needed layers in phase 1, atomically link them all
into the child's IndexPart and upload to S3. This may be done while the
child Timeline is still running.
### Phase 3: modify timelines ancestry
Modify the child's ancestor to None, and upload its IndexPart to persist the change.
For all timelines which have the same parent as the child, and have a branch
point lower than our branch point, switch their ancestor_timeline to the child,
and upload their IndexPart to persist the change.
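A rough sketch of this re-parenting pass, with illustrative types and string IDs standing in for the real in-memory and remote index structures:
```rust
// Hypothetical sketch of the Phase 3 re-parenting step described above.
struct TimelineMeta {
    timeline_id: String,
    ancestor_timeline: Option<String>,
    ancestor_lsn: u64,
}

fn reparent_children(
    timelines: &mut [TimelineMeta],
    old_parent_id: &str,
    detached_child_id: &str,
    branch_point: u64,
) {
    for t in timelines.iter_mut() {
        let is_sibling = t.ancestor_timeline.as_deref() == Some(old_parent_id)
            && t.timeline_id != detached_child_id;
        // Siblings branched below the detach point now descend from the child.
        if is_sibling && t.ancestor_lsn < branch_point {
            t.ancestor_timeline = Some(detached_child_id.to_string());
            // ...and each re-parented timeline re-uploads its IndexPart.
        }
    }
}
```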
## Alternatives considered
### Generate full image layer on child, rather than adopting parent deltas
This would work for the case of a single child, but would prevent re-targeting
other timelines that depended on the parent. If we detached many children this
way, the storage cost would become prohibitive (consider a 1TB database with
100 child timelines: it would cost 100TiB if they all generated their own image layers).
### Don't rewrite anything: just fake it in the API
We could add a layer of indirection that let a child "pretend" that it had no
ancestor, when in reality it still had the parent. The pageserver API could
accept deletion of ancestor timelines, and just update child metadata to make
them look like they have no ancestor.
This would not achieve the desired reduction in storage cost, and may well be more
complex to maintain than simply implementing the API described in this RFC.
### Avoid copying objects: enable child index to use parent layers directly
We could teach IndexPart to store a TimelineId for each layer, such that a child
timeline could reference a parent's layers directly, rather than copying them
into the child's prefix.
This would impose a cost for the normal case of indices that only target the
timeline's own layers, add complexity, and break the useful simplifying
invariant that timelines "own" their own path. If child timelines were
referencing layers from the parent, we would have to ensure that the parent
never runs GC/compaction again, which would make the API less flexible (the
proposal in this RFC enables deletion of the parent but doesn't require it.)
## Performance
### Adopting layers
- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
of such requests: this can take up to tens of seconds and will compete for RemoteStorage
semaphore units with other activity on the pageserver.
- If we are running on storage backend that doesn't implement CopyObject, then
this part will be much more expensive as we would stream all layer content
through the pageserver. This is no different to issuing a lot
of reads to a timeline that does not have a warm local cache: it will move
a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for delta that straddle the branch point will
require streaming read/write of all the layers in question.
### Updating timeline ancestry
The simplest way to update timeline ancestry will probably be to stop and start
all the Timeline objects: this is preferable to the complexity of making their
ancestry mutable at runtime.
There will be a corresponding "stutter" in the availability of the timelines,
of the order 10-100ms, which is the time taken to upload their IndexPart, and
restart the Timeline.
# Interaction with other features
## Concurrent timeline creation
If new historic timelines are created using the parent as an ancestor while the
detach ancestor API is running, they will not be re-parented to the child. This
doesn't break anything, but it leaves the parent in a state where it might not
be possible to delete it.
Since timeline creations are an explicit user action, this is not something we need to
worry about as the storage layer: a user who wants to delete their parent timeline will not create
new children, and if they do, they can choose to delete those children to
enable deleting the parent.
For the least surprise to the user, before starting the detach ancestor branch
operation, the control plane should wait until all branches are created and not
allow any branches to be created before the branch point on the ancestor branch
while the operation is ongoing.
## WAL based disaster recovery
WAL based disaster recovery currently supports only restoring of the main
branch. Enabling WAL based disaster recovery in the future requires that we
keep a record which timeline generated the WAL and at which LSN was a parent
detached. Keep a list of timeline ids and the LSN at which they were detached in
the `index_part.json`. Limit the size of the list to 100 first entries, after
which the WAL disaster recovery will not be possible.
## Sharded tenants
For sharded tenants, calls to the detach ancestor API will pass through the storage
controller, which will handle them the same as timeline creations: invoke first
on shard zero, and then on all the other shards.


@@ -1,507 +0,0 @@
# Timeline Archival
## Summary
This RFC describes a mechanism for pageservers to eliminate local storage + compute work
for timelines which are not in use, in response to external API calls to "archive" a timeline.
The archived state roughly corresponds to fully offloading a timeline to object storage, such
that its cost is purely the cost of that object storage.
## Motivation
Archived timelines serve multiple purposes:
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
database from longer ago than their PITR window.
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
to diligently clean them up later to avoid overloading the pageserver (currently we support
up to ~500 branches per tenant).
### Prior art
Most storage and database systems have some form of snapshot, which can be implemented several ways:
1. full copies of data (e.g. an EBS snapshot to S3)
2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
3. a series of snapshots which are CoW or de-duplicated relative to one another.
Today's Neon branches are approximately like `2.`, although due to implementation details branches
often end up storing much more data than they really need, as parent branches assume that all data
at the branch point is needed. The layers pinned in the parent branch may have a much larger size
than the physical size of a compressed image layer representing the data at the branch point.
## Requirements
- Enter & exit the archived state in response to external admin API calls
- API calls to modify the archived state are atomic and durable
- An archived timeline should eventually (once out of PITR window) use an efficient compressed
representation, and avoid retaining arbitrarily large data in its parent branch.
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
but must not scale with the number of _archived_ branches.
- Background I/O for archived branches should only be done a limited number of times to evolve them
to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping"
overhead for archived branches, including operations related to calculating sizes for billing.
- The pageserver should put no load on the safekeeper for archived branches.
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
to a performant state in a short time (linear with the branch's logical size)
## Non Goals
- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
in Neon's internal format.
- Compute cold starts after activating an archived branch will not have comparable performance to
cold starts on an active branch.
- Archived branches will not use any new/additional compression or de-duplication beyond what
is already implemented for image layers (zstd per page).
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
are only activated explicitly via the HTTP API.
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
detailed HTTP APIs other than the specific API for listing archived timelines.
- A parent branch may not be archived unless all its children are.
## Impacted Components
pageserver, storage controller
## Terminology
**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
may assume that this branch is now very cheap to store, although this may not be physically so until the
branch proceeds to the offloaded state.
**Active** branches are branches which are available for use by page_service clients, and have a relatively
high cost due to consuming local storage.
**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
**Activate** (verb): transition from Archived to Active
**Archive** (verb): transition from Active to Archived
**Offload** (verb): transition from Archived to Offloaded
**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is
warmed up, good performance will be available to page_service clients.
## Implementation
### High level flow
We may think of a timeline which is archived and then activated as proceeding through a series of states:
```mermaid
stateDiagram
[*] --> Active(warm)
Active(warm) --> Archived
Archived --> Offloaded
Archived --> Active(warm)
Offloaded --> Active(cold)
Active(cold) --> Active(warm)
```
Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
of branches will be:
- Very frequent: Short lived branches: Active -> Deleted
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
- Rare: Branches used to restore old state: Active -> Archived -> Offloaded -> Active
These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
of:
- the timeline's lifecycle state: active or archived, stored in the timeline's index
- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
manifest of offloaded timelines.
- cache state (whether it's warm or cold).
### Storage format changes
There are two storage format changes:
1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
be considered active or archived.
2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
The manifest object will have a format like this:
```
{
"offload_timelines": [
{
"timeline_id": ...
"last_record_lsn": ...
"last_record_lsn_time": ...
"pitr_interval": ...
"last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
"logical_size": ... # The size at last_record_lsn
"physical_size" ...
"parent": Option<{
"timeline_id"...
"lsn"... # Branch point LSN on the parent
"requires_data": bool # True if this branch depends on layers in its parent, identify it here
}>
}
]
}
```
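For illustration, a plausible serde mapping of this layout; field names follow the JSON sketch above, while the concrete types (strings for IDs, LSNs and intervals, integer sizes) are assumptions rather than the real definitions:
```rust
// Hedged sketch of the tenant manifest types; not the actual definitions.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct TenantManifest {
    offload_timelines: Vec<OffloadedTimeline>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedTimeline {
    timeline_id: String,
    last_record_lsn: String,
    last_record_lsn_time: String,
    pitr_interval: String,
    last_gc_lsn: String,
    logical_size: u64,
    physical_size: u64,
    parent: Option<OffloadedParent>,
}

#[derive(Serialize, Deserialize)]
struct OffloadedParent {
    timeline_id: String,
    lsn: String,
    // True if this branch depends on layers in its parent.
    requires_data: bool,
}
```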
The information about a timeline in its offload state is intentionally minimal: just enough to decide:
- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn.
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
layers that the archived branch depends on
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
is received for a timeline_id that isn't in the set of live `Timelines` or in the manifest, then
we don't need to go to S3 for the deletion.
- How much archived space to report in consumption metrics
The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
(offloaded timelines).
For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but
give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code
for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
the manifest file.
### API & Timeline state
Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will
be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which
may be used in the future to control other per-timeline attributes (e.g. we may eventually make the PITR interval
a per-timeline configuration).
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
```
{
"state": "active|archive"
}
```
When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's
index, but not any data: it should be about as fast as a couple of small S3 requests.
The API will be available with identical path via the storage controller: calling this on a sharded tenant
will simply map the API call to all the shards.
Archived timelines may never have descendant timelines which are active. This will be enforced at the API level,
such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
that all its descendants are archived. It is the caller's responsibility to walk the hierarchy of timelines
in the proper order if they would like to archive whole trees of branches.
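A minimal sketch of that API-level check, over a simplified in-memory view of the branch tree (the types and helper
names here are hypothetical, not the pageserver's real ones):

```rust
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq)]
enum TimelineState {
    Active,
    Archived,
}

struct TimelineNode {
    state: TimelineState,
    parent: Option<u64>, // parent timeline id, if any
    children: Vec<u64>,  // child timeline ids
}

/// Activating requires every ancestor to be active already.
fn can_activate(timelines: &HashMap<u64, TimelineNode>, id: u64) -> bool {
    let mut cursor = timelines[&id].parent;
    while let Some(parent_id) = cursor {
        let parent = &timelines[&parent_id];
        if parent.state != TimelineState::Active {
            return false;
        }
        cursor = parent.parent;
    }
    true
}

/// Archiving requires every descendant to be archived already.
fn can_archive(timelines: &HashMap<u64, TimelineNode>, id: u64) -> bool {
    let mut stack = timelines[&id].children.clone();
    while let Some(child_id) = stack.pop() {
        let child = &timelines[&child_id];
        if child.state != TimelineState::Archived {
            return false;
        }
        stack.extend(child.children.iter().copied());
    }
    true
}
```

Note that each check only walks the ancestor chain or the descendant subtree of the one timeline being changed, so it
stays cheap even for tenants with many branches.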
Because archived timelines will be excluded from the usual timeline listing APIs, a new API specifically
for archived timelines will be added for use in support/debug:
```
GET /v1/tenants/{tenant_id}/archived_timelines
{
...same per-timeline content as the tenant manifest...
}
```
### Tenant attach changes
Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
we load its index_part.json. To avoid the number of GETs scaling linearly with the number of archived
timelines, we must have a single object that tells us which timelines do not need to be loaded. The
number of ListObjects requests while listing timelines will still scale as O(N), but this is less problematic
because each request covers up to 1000 timelines.
This is **not** literally the same as the set of timelines that have state=archived. Rather, it is
the set of timelines which have been offloaded in the background after their state was set to archived.
We may simply skip loading these timelines: there will be no special state of `Timeline`; they simply won't
exist from the perspective of an active `Tenant`, except for deletion: timeline deletion will need
to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 when asked
to delete an offloaded timeline.
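The attach-time selection might then look roughly like this (hypothetical helper types; the manifest is reduced to a
list of offloaded timeline IDs and error handling is elided):

```rust
use std::collections::HashSet;

// Hypothetical stand-in: the real manifest type is richer (see the earlier
// sketch); here it is reduced to a list of offloaded timeline IDs.
struct TenantManifest {
    offload_timelines: Vec<String>,
}

/// Given the timeline IDs listed under the tenant's S3 prefix and the tenant
/// manifest (if any), return the timelines whose index_part we actually load.
fn timelines_to_load(listed_in_s3: Vec<String>, manifest: &TenantManifest) -> Vec<String> {
    let offloaded: HashSet<&String> = manifest.offload_timelines.iter().collect();
    listed_in_s3
        .into_iter()
        // Offloaded timelines are skipped entirely: no index_part GET and no
        // Timeline object; they remain reachable only via the tenant's offloaded set.
        .filter(|timeline_id| !offloaded.contains(timeline_id))
        .collect()
}
```

The ListObjects pass itself still happens, but the per-timeline index_part GETs now scale only with the number of
non-offloaded timelines.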
### Warm-up API
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
This API will be similar to the existing `download_remote_layers` API, but smarter:
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
of downloads, so that the caller can poll.
The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache
eviction and heatmaps, as well as in this specific case of warming up a timeline.
The caller does not have to wait for the warm-up API, or call it at all. It is strongly advised
to call it, though: otherwise, populating local contents for a timeline can take a long time, since we wait
for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
volatile.
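A caller could drive this API with a polling loop along the lines of the sketch below. The endpoint path is the one
given above; the `DownloadProgress` fields are assumptions, since this RFC does not pin down the shape of the progress
struct, and the client plumbing (reqwest/tokio) is just one possible choice.

```rust
use serde::Deserialize;
use std::time::Duration;

// Assumed shape of the progress struct returned by the download endpoint;
// the RFC only says it is "a struct describing progress of downloads".
#[derive(Deserialize)]
struct DownloadProgress {
    complete: bool,
    layers_downloaded: u64,
    layers_total: u64,
}

/// Poll the warm-up endpoint until the visible layer set is resident.
async fn warm_up_timeline(
    client: &reqwest::Client,
    base_url: &str,
    tenant_id: &str,
    timeline_id: &str,
) -> anyhow::Result<()> {
    let url =
        format!("{base_url}/v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=5000");
    loop {
        // Each call downloads visible layers for up to wait_ms, then reports progress.
        let progress: DownloadProgress = client.put(&url).send().await?.json().await?;
        println!(
            "warm-up: {}/{} layers downloaded",
            progress.layers_downloaded, progress.layers_total
        );
        if progress.complete {
            return Ok(());
        }
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
}
```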
### Background work
Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters
an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
if its state permits that.
Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
has elapsed and it can now be rewritten to image layers.
#### Archive branch offload
Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
any actual work.
That work is done in the background compaction loop. It makes sense to tack this work onto the compaction
loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
The condition for offload is simple:
- a `Timeline` object exists with state `Archived`
- the timeline does not have any non-offloaded children.
Regarding the condition that children must be offloaded: this will always eventually become true, because
we enforce at the API level that children of archived timelines must themselves be archived, and all
archived timelines will eventually be offloaded.
Offloading a timeline is simple (see the sketch after this list):
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
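A rough sketch of how the compaction loop might select offload candidates, with the per-timeline steps above noted as
comments (all types and fields here are simplified stand-ins for the real pageserver internals):

```rust
// Hypothetical, simplified view of the offload pass described above.
#[derive(Clone, Copy, PartialEq)]
enum TimelineState {
    Active,
    Archived,
}

struct TimelineSummary {
    id: String,
    state: TimelineState,
    non_offloaded_children: usize,
}

/// A timeline is offloadable once it is archived and all of its children
/// have themselves been offloaded.
fn offload_candidates(timelines: &[TimelineSummary]) -> Vec<&TimelineSummary> {
    timelines
        .iter()
        .filter(|t| t.state == TimelineState::Archived && t.non_offloaded_children == 0)
        .collect()
}

// For each candidate, the loop would then, in order:
//   1. capture the attributes that go into the manifest (logical size, LSNs, ...)
//   2. shut the Timeline down and remove it from the tenant's timeline map
//   3. remove the timeline's local directory (remove_dir_all)
//   4. upload the updated tenant manifest so it is skipped at the next attach
```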
#### Archive branch optimization (flattening)
When we offloaded a branch, it might have had some history that prevented rewriting it to a single
point-in-time set of image layers. For example, a branch might have several days of writes and a 7-day
PITR: when we archive it, it still has those days of history.
Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
a point in time compared with delta layers
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
for data, i.e. the ancestor is free to GC layer files at and below the branch point
Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
a true snapshot at that LSN.
It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
Archive branch optimization should be done _before_ background offloads during compaction, because there may
be timelines which are ready to be offloaded but would also benefit from the optimization step before
being offloaded. For example, a branch which has already fallen out of the PITR window and has no history
of its own may be immediately rewritten as a series of image layers before being offloaded.
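As a sketch, the compaction loop's "does this offloaded entry need flattening?" test could be derived from the
manifest fields alone. Here the pitr_lsn mentioned earlier is interpreted as the manifest's last_gc_lsn, and times are
assumed to be unix seconds, so treat this as illustrative only:

```rust
use std::time::{Duration, SystemTime, UNIX_EPOCH};

// Fields mirror the manifest sketch earlier; encodings are assumptions.
struct OffloadedEntry {
    last_record_lsn: u64,
    last_gc_lsn: u64,
    last_record_lsn_time: u64, // unix seconds
    pitr_interval_secs: u64,
}

/// An offloaded entry needs optimization once its PITR window has elapsed and
/// it still carried history of its own when offloaded (last_gc_lsn behind
/// last_record_lsn), i.e. it has not already been flattened to image layers.
fn ready_to_flatten(entry: &OffloadedEntry, now: SystemTime) -> bool {
    let now_secs = now
        .duration_since(UNIX_EPOCH)
        .unwrap_or(Duration::ZERO)
        .as_secs();
    let pitr_elapsed = now_secs > entry.last_record_lsn_time + entry.pitr_interval_secs;
    let not_yet_flattened = entry.last_gc_lsn < entry.last_record_lsn;
    pitr_elapsed && not_yet_flattened
}
```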
### Consumption metrics
Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipation
that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
vs. ordinary content.
Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
### Secondary locations
Archived timelines (including offloaded timelines) will be excluded from heatmaps; as a result, once
a timeline is archived, its contents will be dropped from secondary locations after the next cycle of
heatmap upload & secondary download.
### Sharding
Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
the same way that timeline creation and deletion are done. There are no special rules about ordering:
the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
will be authoritative for consumption metrics.
## Error cases
### Errors in sharded tenants
If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
state, where a timeline is archived on some shards but not on others.
We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent
state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't
break anything, it's just "weird".
This is similar to the status quo for timeline creation and deletion: callers are expected to retry
these operations until they succeed.
### Archiving/activating
Archiving/activating a timeline can fail in a limited number of ways:
1. I/O error storing/reading the timeline's updated index
- These errors are always retryable: a fundamental design assumption of the pageserver is that remote
storage errors are always transient.
2. NotFound if the timeline doesn't exist
- Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
- The storage controller has runtime locking to prevent races such as deleting a timeline while
archiving it.
3. BadRequest if the rules around ancestors/descendants of archived timelines would be violated
- Callers are expected to do their own checks to avoid hitting this case. If they make
a mistake and encounter this error, they should give up.
### Offloading
Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
tenant manifest. In such error cases, we give up in the expectation that offloading will be tried
again at the next iteration of the compaction loop.
### Archive branch optimization
Optimization is a special form of compaction, so it can encounter all the same errors as regular compaction:
it should return `Result<(), CompactionError>`, and as with compaction it will be retried on
the next iteration of the compaction loop.
## Optimizations
### Delaying storage optimization if retaining parent layers is cheaper
Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
are offloaded to S3 they're totally safe, inert things.
However, in some cases it can be advantageous to retain extra history on their parent branch rather
than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB
of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
for each nightly branch is inefficient compared with just keeping more history on the main branch.
Getting this right requires consideration of:
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
write out extra image layers, then it might make more sense to just write out the image layers on
the archived branch.
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely
large layer map can cause problems elsewhere.
This optimization can probably be implemented quite cheaply with some basic heuristics like the following (sketched in code below):
- don't bother doing optimization on an archive branch if the LSN distance between
its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
- ...but don't keep more history on the main branch than double the PITR
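Those heuristics could look something like this first-cut sketch; the thresholds come straight from the bullets
above, while the struct and parameter names are made up for illustration:

```rust
struct ArchivedBranchStats {
    /// LSN distance (bytes of WAL) between the branch point and the end of the
    /// parent's PITR window.
    lsn_distance_to_pitr_end: u64,
    /// Logical size of the archived branch, in bytes.
    logical_size: u64,
}

/// Delay flattening while retaining a little extra parent history is cheaper,
/// but never retain more than double the configured PITR on the parent.
fn should_delay_flattening(
    stats: &ArchivedBranchStats,
    parent_history_retained: std::time::Duration,
    parent_pitr: std::time::Duration,
) -> bool {
    let small_delta = stats.lsn_distance_to_pitr_end < stats.logical_size / 20; // <5%
    let retention_ok = parent_history_retained < parent_pitr * 2;
    small_delta && retention_ok
}
```

Tuning the 5% and 2x constants is exactly the kind of decision driven by the compaction and metadata-bloat
considerations above.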
### Creating a timeline in archived state (a snapshot)
Sometimes, one might want to create a branch with no history, which will not be written to
before it is archived. This is a snapshot, although we do not require a special snapshot API,
since a snapshot can be represented as a timeline with no history.
This can be accomplished by simply creating a timeline and then immediately archiving it, but
that is somewhat wasteful: the timeline will spin up various tasks and open a connection to the storage
broker to try to ingest WAL, before being shut down by the subsequent archival call. To explicitly
support this common special case, we may add a parameter to the timeline creation API which
creates a timeline directly in the archived state.
Such a timeline creation will do only two I/Os in total:
- write the index_part object to record the timeline's existence
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
write the tenant manifest.
Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
up the 'snapshot' branch and write out image layers.
## Future Work
### Enabling `fullbackup` dumps from archive branches
It would be useful to be able to export an archive branch to another system, or for use in a local
postgres database.
This could be implemented as a general capability for all branches, in which case it would "just work"
for archive branches by activating them. However, downloading all the layers in a branch just to generate
a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup
stream to S3 in an intermediate format and then having one node stitch them together).
### Tagging layers from archived branches
When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
cheaper storage.
This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
external hints on which branches are likely to be reactivated, and which branches are good candidates for
tagging for low performance storage.
Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object
stores have similar mechanisms.
### Storing sequences of archive branches as deltas
When archived branches are used as scheduled snapshots, we could store them even more efficiently
by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
pages). This is the kind of encoding that many backup storage systems use.
The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full
copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
so the complexity tradeoff of diff-encoding it is dubious).
One does not necessarily have to read back the previous snapshot in order to encode the next one: if the
pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch,
so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
delta snapshot".
Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
each other: perhaps this would be done by making the archive branches have child/parent relationships with
each other, or perhaps we would permit them to remain children of their original parent, but additionally
have a relationship with the snapshot they're encoded relative to.
Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
out how frequently to write a full copy is important. This is essentially a zoomed-out version of what
we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
## FAQ/Alternatives
### Store all timelines in the tenant manifest
Rather than special-casing offloaded timelines in the offload manifest, we could store a total
manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
startup.
That would be a more invasive change (require hooking in to timeline creation), and would
generate much more I/O to this manifest for tenants that had many branches _and_ frequent
create/delete cycles for short-lived branches. Restricting the manifest to offloaded timelines
means that we only have to cope with the rate at which long-lived timelines are archived, rather
than the rate at which short-lived timelines are created & destroyed.
### Automatically archiving/activating timelines without external API calls
We could implement TTL-driven offload of timelines, waking them up when a page request
arrives.
This has downsides:
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
know which of their branches are in this state, and might get a surprise when they try
to use such a branch.
- Price fluctuation: if the archival of a branch is used in end-user pricing, then users
prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it
is created, rather than having a usage-dependent storage price.
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
would be awkward, compared with an external entry point.
### Make offloaded a state of Timeline
To reduce the operator-facing complexity of having some timelines APIs that only return
non-offloaded timelines, we could build the offloaded state into the Timeline type.
`timeline.rs` is already one of the most egregiously long source files in the tree, so
this is rejected on the basis that we need to avoid making that complexity worse.
View File
@@ -44,7 +44,7 @@ If you need to modify the database schema, heres how to create a migration:
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once youve committed a migration no further steps are needed.
View File
@@ -13,7 +13,11 @@ use std::{
use measured::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
metric::{
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
};
@@ -140,7 +144,6 @@ impl<const N: usize> HyperLogLogState<N> {
})
}
}
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N>
{
@@ -179,13 +182,12 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
CounterState::new(val as u64).collect_into(
&(),
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
name.by_ref(),
enc,
MetricValue::Int(val as i64),
)
})
}
View File
@@ -9,7 +9,7 @@ use measured::{
metric::{
counter::CounterState,
gauge::GaugeState,
group::Encoding,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
@@ -171,11 +171,8 @@ fn write_gauge<Enc: Encoding>(
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err>
where
GaugeState: MetricEncoding<Enc>,
{
GaugeState::new(x).collect_into(&(), labels, name, enc)
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
@@ -547,6 +544,15 @@ impl<T: Encoding> Encoding for Inc<T> {
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
@@ -573,6 +579,15 @@ impl<T: Encoding> Encoding for Dec<T> {
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
/// Write the dec counter to the encoder
View File
@@ -9,7 +9,6 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
};
@@ -294,6 +293,7 @@ pub struct TenantConfig {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
@@ -437,8 +437,22 @@ pub enum CompactionAlgorithm {
Tiered,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
Serialize,
Deserialize,
strum_macros::FromRepr,
strum_macros::EnumString,
)]
#[strum(serialize_all = "kebab-case")]
pub enum ImageCompressionAlgorithm {
/// Disabled for writes, and never decompress during reading.
/// Never set this after you've enabled compression once!
DisabledNoDecompress,
// Disabled for writes, support decompressing during read path
Disabled,
/// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
@@ -448,27 +462,9 @@ pub enum ImageCompressionAlgorithm {
},
}
impl FromStr for ImageCompressionAlgorithm {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut components = s.split(['(', ')']);
let first = components
.next()
.ok_or_else(|| anyhow::anyhow!("empty string"))?;
match first {
"disabled" => Ok(ImageCompressionAlgorithm::Disabled),
"zstd" => {
let level = if let Some(v) = components.next() {
let v: i8 = v.parse()?;
Some(v)
} else {
None
};
Ok(ImageCompressionAlgorithm::Zstd { level })
}
_ => anyhow::bail!("invalid specifier '{first}'"),
}
impl ImageCompressionAlgorithm {
pub fn allow_decompression(&self) -> bool {
!matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress)
}
}
@@ -1664,25 +1660,4 @@ mod tests {
AuxFilePolicy::CrossValidation
);
}
#[test]
fn test_image_compression_algorithm_parsing() {
use ImageCompressionAlgorithm::*;
assert_eq!(
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
Disabled
);
assert_eq!(
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
Zstd { level: None }
);
assert_eq!(
ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
Zstd { level: Some(18) }
);
assert_eq!(
ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
Zstd { level: Some(-3) }
);
}
}
View File
@@ -1,6 +1,6 @@
use utils::id::TimelineId;
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
#[derive(Default, serde::Serialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
}
View File
@@ -1,42 +1,59 @@
//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
//!
//! This module contains a variety of types used to represent the concept of sharding
//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
//! we provide an summary here.
//!
//! Types used to describe shards:
//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
//! which identifies a tenant which is not shard-aware. This means its storage paths do not include
//! a shard suffix.
//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
//! without the tenant ID. This is useful for things that are implicitly scoped to a particular
//! tenant, such as layer files.
//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
//! detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
//! four hex digits. An unsharded tenant is `0000`.
//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
//!
//! Types used to describe the parameters for data distribution in a sharded tenant:
//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
//! multiple shards. Its value is given in 8kiB pages.
//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
//! always zero: this is provided for future upgrades that might introduce different
//! data distribution schemes.
//!
//! Examples:
//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
//! and their slugs are 0004, 0104, 0204, and 0304.
use std::{ops::RangeInclusive, str::FromStr};
use crate::{key::Key, models::ShardParameters};
use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::id::TenantId;
#[doc(inline)]
pub use ::utils::shard::*;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide an summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
@@ -48,6 +65,362 @@ pub struct ShardIdentity {
layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as [`TenantShardId::unsharded`].
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
pub fn count(&self) -> u8 {
if self.0 > 0 {
self.0
} else {
1
}
}
/// The literal internal value: this is **not** the number of shards in the
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
/// [`Self::count`] if you want to know the cardinality of shards.
pub fn literal(&self) -> u8 {
self.0
}
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
/// uses the legacy format for `TenantShardId`. See also the documentation for
/// [`Self::count`].
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
/// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return.
pub const fn new(val: u8) -> Self {
Self(val)
}
}
impl ShardNumber {
pub const MAX: Self = Self(u8::MAX);
}
impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self {
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
RangeInclusive::new(
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
},
Self {
tenant_id,
shard_number: ShardNumber::MAX,
shard_count: ShardCount::MAX,
},
)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
}
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
/// is useful when logging from code that is already in a span that includes tenant ID, to
/// keep messages reasonably terse.
pub fn to_index(&self) -> ShardIndex {
ShardIndex {
shard_number: self.shard_number,
shard_count: self.shard_count,
}
}
/// Calculate the children of this TenantShardId when splitting the overall tenant into
/// the given number of shards.
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
let mut child_shards = Vec::new();
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
// Key mapping is based on a round robin mapping of key hash modulo shard count,
// so our child shards are the ones which the same keys would map to.
if shard_number % effective_old_shard_count == self.shard_number.0 {
child_shards.push(TenantShardId {
tenant_id: self.tenant_id,
shard_number: ShardNumber(shard_number),
shard_count: new_shard_count,
})
}
}
child_shards
}
}
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02x}{:02x}",
self.0.shard_number.0, self.0.shard_count.0
)
}
}
impl std::fmt::Display for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.shard_count != ShardCount(0) {
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
} else {
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
// is distinct from the normal single shard case (shard count == 1).
self.tenant_id.fmt(f)
}
}
}
impl std::fmt::Debug for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for TenantShardId {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
if s.len() == 32 {
// Legacy case: no shard specified
Ok(Self {
tenant_id: TenantId::from_str(s)?,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
})
} else if s.len() == 37 {
let bytes = s.as_bytes();
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
Ok(Self {
tenant_id,
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 18]> for TenantShardId {
fn from(b: [u8; 18]) -> Self {
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
Self {
tenant_id: TenantId::from(tenant_id_bytes),
shard_number: ShardNumber(b[16]),
shard_count: ShardCount(b[17]),
}
}
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
shard_number: number,
shard_count: count,
}
}
pub fn unsharded() -> Self {
Self {
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// For use in constructing remote storage paths: concatenate this with a TenantId
/// to get a fully qualified TenantShardId.
///
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
/// that the legacy pre-sharding remote key format is preserved.
pub fn get_suffix(&self) -> String {
if self.is_unsharded() {
"".to_string()
} else {
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
}
impl std::fmt::Display for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Debug for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for ShardIndex {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 1 byte shard number, 1 byte shard count
if s.len() == 4 {
let bytes = s.as_bytes();
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(bytes, &mut shard_parts)?;
Ok(Self {
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 2]> for ShardIndex {
fn from(b: [u8; 2]) -> Self {
Self {
shard_number: ShardNumber(b[0]),
shard_count: ShardCount(b[1]),
}
}
}
impl Serialize for TenantShardId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0;
packed[17] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for TenantShardId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = TenantShardId;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 18])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 18] = Deserialize::deserialize(s)?;
Ok(TenantShardId::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
TenantShardId::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
18,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
@@ -212,6 +585,77 @@ impl ShardIdentity {
}
}
impl Serialize for ShardIndex {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Binary encoding is not used in index_part.json, but is included in anticipation of
// switching various structures (e.g. inter-process communication, remote metadata) to more
// compact binary encodings in future.
let mut packed: [u8; 2] = [0; 2];
packed[0] = self.shard_number.0;
packed[1] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for ShardIndex {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = ShardIndex;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 2])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 2] = Deserialize::deserialize(s)?;
Ok(ShardIndex::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
ShardIndex::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
2,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
/// in order to be able to serve basebackup requests without peer communication).
fn key_is_shard0(key: &Key) -> bool {
@@ -293,9 +737,7 @@ pub fn describe(
#[cfg(test)]
mod tests {
use std::str::FromStr;
use utils::{id::TenantId, Hex};
use utils::Hex;
use super::*;
View File
@@ -13,7 +13,6 @@ rustls.workspace = true
serde.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tokio-rustls.workspace = true
tracing.workspace = true
@@ -24,4 +23,4 @@ workspace_hack.workspace = true
once_cell.workspace = true
rustls-pemfile.workspace = true
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
tokio-postgres-rustls.workspace = true
View File
@@ -16,7 +16,6 @@ use std::{fmt, io};
use std::{future::Future, str::FromStr};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn};
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -401,15 +400,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
}
/// Wrapper for run_message_loop() that shuts down socket when we are done
pub async fn run(
pub async fn run<F, S>(
mut self,
handler: &mut impl Handler<IO>,
cancel: &CancellationToken,
) -> Result<(), QueryError> {
let ret = self.run_message_loop(handler, cancel).await;
shutdown_watcher: F,
) -> Result<(), QueryError>
where
F: Fn() -> S + Clone,
S: Future,
{
let ret = self
.run_message_loop(handler, shutdown_watcher.clone())
.await;
tokio::select! {
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// do nothing; we most likely got already stopped by shutdown and will log it next.
}
_ = self.framed.shutdown() => {
@@ -439,17 +444,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
}
}
async fn run_message_loop(
async fn run_message_loop<F, S>(
&mut self,
handler: &mut impl Handler<IO>,
cancel: &CancellationToken,
) -> Result<(), QueryError> {
shutdown_watcher: F,
) -> Result<(), QueryError>
where
F: Fn() -> S,
S: Future,
{
trace!("postgres backend to {:?} started", self.peer_addr);
tokio::select!(
biased;
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received during handshake");
return Err(QueryError::Shutdown)
@@ -464,7 +473,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
let mut query_string = Bytes::new();
while let Some(msg) = tokio::select!(
biased;
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received in run_message_loop");
return Err(QueryError::Shutdown)
@@ -476,7 +485,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
let result = self.process_message(handler, msg, &mut query_string).await;
tokio::select!(
biased;
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received during response flush");
@@ -663,17 +672,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
assert!(self.state < ProtoState::Authentication);
let have_tls = self.tls_config.is_some();
match msg {
FeStartupPacket::SslRequest { direct } => {
FeStartupPacket::SslRequest => {
debug!("SSL requested");
if !direct {
self.write_message(&BeMessage::EncryptionResponse(have_tls))
.await?;
} else if !have_tls {
return Err(QueryError::Other(anyhow::anyhow!(
"direct SSL negotiation but no TLS support"
)));
}
self.write_message(&BeMessage::EncryptionResponse(have_tls))
.await?;
if have_tls {
self.start_tls().await?;
View File
@@ -3,14 +3,13 @@ use once_cell::sync::Lazy;
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
use pq_proto::{BeMessage, RowDescriptor};
use std::io::Cursor;
use std::sync::Arc;
use std::{future, sync::Arc};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::{TcpListener, TcpStream};
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::MakeTlsConnect;
use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
use tokio_postgres_rustls::MakeRustlsConnect;
use tokio_util::sync::CancellationToken;
// generate client, server test streams
async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -51,7 +50,7 @@ async fn simple_select() {
tokio::spawn(async move {
let mut handler = TestHandler {};
pgbackend.run(&mut handler, &CancellationToken::new()).await
pgbackend.run(&mut handler, future::pending::<()>).await
});
let conf = Config::new();
@@ -103,7 +102,7 @@ async fn simple_select_ssl() {
tokio::spawn(async move {
let mut handler = TestHandler {};
pgbackend.run(&mut handler, &CancellationToken::new()).await
pgbackend.run(&mut handler, future::pending::<()>).await
});
let client_cfg = rustls::ClientConfig::builder()
View File
@@ -48,15 +48,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
#[allow(clippy::identity_op)]
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
/// Interval of checkpointing metadata file. We should store metadata file to enforce
/// predicate that checkpoint.nextXid is larger than any XID in WAL.
/// But flushing checkpoint file for each transaction seems to be too expensive,
/// so XID_CHECKPOINT_INTERVAL is used to forward align nextXid and so perform
/// metadata checkpoint only once per XID_CHECKPOINT_INTERVAL transactions.
/// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
/// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG.
const XID_CHECKPOINT_INTERVAL: u32 = 1024;
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
}
@@ -331,14 +322,10 @@ impl CheckPoint {
/// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool {
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
let mut new_xid = std::cmp::max(
let new_xid = std::cmp::max(
xid.wrapping_add(1),
pg_constants::FIRST_NORMAL_TRANSACTION_ID,
);
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
new_xid =
new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
let full_xid = self.nextXid.value;
let old_xid = full_xid as u32;
if new_xid.wrapping_sub(old_xid) as i32 > 0 {
@@ -360,7 +347,7 @@ impl CheckPoint {
/// Advance next multi-XID/offset to those given in arguments.
///
/// It's important that this handles wraparound correctly. This should match the
/// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function.
/// MultiXactAdvceNextMXact() logic in PostgreSQL's xlog_redo() function.
///
/// Returns 'true' if the Checkpoint was updated.
pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
View File
@@ -44,9 +44,9 @@ impl ConnectionError {
/// Wraps async io `stream`, providing messages to write/flush + read Postgres
/// messages.
pub struct Framed<S> {
pub stream: S,
pub read_buf: BytesMut,
pub write_buf: BytesMut,
stream: S,
read_buf: BytesMut,
write_buf: BytesMut,
}
impl<S> Framed<S> {
View File
@@ -39,39 +39,14 @@ pub enum FeMessage {
PasswordMessage(Bytes),
}
#[derive(Clone, Copy, PartialEq, PartialOrd)]
pub struct ProtocolVersion(u32);
impl ProtocolVersion {
pub const fn new(major: u16, minor: u16) -> Self {
Self((major as u32) << 16 | minor as u32)
}
pub const fn minor(self) -> u16 {
self.0 as u16
}
pub const fn major(self) -> u16 {
(self.0 >> 16) as u16
}
}
impl fmt::Debug for ProtocolVersion {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_list()
.entry(&self.major())
.entry(&self.minor())
.finish()
}
}
#[derive(Debug)]
pub enum FeStartupPacket {
CancelRequest(CancelKeyData),
SslRequest {
direct: bool,
},
SslRequest,
GssEncRequest,
StartupMessage {
version: ProtocolVersion,
major_version: u32,
minor_version: u32,
params: StartupMessageParams,
},
}
@@ -326,23 +301,11 @@ impl FeStartupPacket {
/// different from [`FeMessage::parse`] because startup messages don't have
/// message type byte; otherwise, its comments apply.
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
// <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
// First byte indicates standard SSL handshake message
// (It can't be a Postgres startup length because in network byte order
// that would be a startup packet hundreds of megabytes long)
if buf.first() == Some(&0x16) {
return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
}
const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
const CANCEL_REQUEST_CODE: u32 = 5678;
const NEGOTIATE_SSL_CODE: u32 = 5679;
const NEGOTIATE_GSS_CODE: u32 = 5680;
// need at least 4 bytes with packet len
if buf.len() < 4 {
@@ -375,10 +338,12 @@ impl FeStartupPacket {
let mut msg = buf.split_to(len).freeze();
msg.advance(4); // consume len
let request_code = ProtocolVersion(msg.get_u32());
let request_code = msg.get_u32();
let req_hi = request_code >> 16;
let req_lo = request_code & ((1 << 16) - 1);
// StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
let message = match request_code {
CANCEL_REQUEST_CODE => {
let message = match (req_hi, req_lo) {
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
if msg.remaining() != 8 {
return Err(ProtocolError::BadMessage(
"CancelRequest message is malformed, backend PID / secret key missing"
@@ -390,22 +355,21 @@ impl FeStartupPacket {
cancel_key: msg.get_i32(),
})
}
NEGOTIATE_SSL_CODE => {
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
// Requested upgrade to SSL (aka TLS)
FeStartupPacket::SslRequest { direct: false }
FeStartupPacket::SslRequest
}
NEGOTIATE_GSS_CODE => {
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
// Requested upgrade to GSSAPI
FeStartupPacket::GssEncRequest
}
version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
return Err(ProtocolError::Protocol(format!(
"Unrecognized request code {}",
version.minor()
"Unrecognized request code {unrecognized_code}"
)));
}
// TODO bail if protocol major_version is not 3?
version => {
(major_version, minor_version) => {
// StartupMessage
let s = str::from_utf8(&msg).map_err(|_e| {
@@ -418,7 +382,8 @@ impl FeStartupPacket {
})?;
FeStartupPacket::StartupMessage {
version,
major_version,
minor_version,
params: StartupMessageParams {
params: msg.slice_ref(s.as_bytes()),
},
@@ -557,10 +522,6 @@ pub enum BeMessage<'a> {
RowDescription(&'a [RowDescriptor<'a>]),
XLogData(XLogDataBody<'a>),
NoticeResponse(&'a str),
NegotiateProtocolVersion {
version: ProtocolVersion,
options: &'a [&'a str],
},
KeepAlive(WalSndKeepAlive),
}
@@ -984,18 +945,6 @@ impl<'a> BeMessage<'a> {
buf.put_u8(u8::from(req.request_reply));
});
}
BeMessage::NegotiateProtocolVersion { version, options } => {
buf.put_u8(b'v');
write_body(buf, |buf| {
buf.put_u32(version.0);
buf.put_u32(options.len() as u32);
for option in options.iter() {
write_cstr(option, buf)?;
}
Ok(())
})?
}
}
Ok(())
}
View File
@@ -1,114 +0,0 @@
use std::{
fmt::Display,
time::{Duration, Instant},
};
use metrics::IntCounter;
/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
/// to mitigate the log spam from repeated failures.
pub struct CircuitBreaker {
/// An identifier that enables us to log useful errors when a circuit is broken
name: String,
/// Consecutive failures since last success
fail_count: usize,
/// How many consecutive failures before we break the circuit
fail_threshold: usize,
/// If circuit is broken, when was it broken?
broken_at: Option<Instant>,
/// If set, we will auto-reset the circuit this long after it was broken. If None, broken
/// circuits stay broken forever, or until success() is called.
reset_period: Option<Duration>,
/// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker
/// to permit something to keep running even if it would otherwise have tripped it.
short_circuit: bool,
}
impl CircuitBreaker {
pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
Self {
name,
fail_count: 0,
fail_threshold,
broken_at: None,
reset_period,
short_circuit: false,
}
}
/// Construct an unbreakable circuit breaker, for use in unit tests etc.
pub fn short_circuit() -> Self {
Self {
name: String::new(),
fail_threshold: 0,
fail_count: 0,
broken_at: None,
reset_period: None,
short_circuit: true,
}
}
pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
where
E: Display,
{
if self.short_circuit {
return;
}
self.fail_count += 1;
if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
self.break_circuit(metric, error);
}
}
/// Call this after successfully executing an operation
pub fn success(&mut self, metric: &IntCounter) {
self.fail_count = 0;
if let Some(broken_at) = &self.broken_at {
tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
humantime::format_duration(broken_at.elapsed()));
self.broken_at = None;
metric.inc();
}
}
/// Call this before attempting an operation, and skip the operation if we are currently broken.
pub fn is_broken(&mut self) -> bool {
if self.short_circuit {
return false;
}
if let Some(broken_at) = self.broken_at {
match self.reset_period {
Some(reset_period) if broken_at.elapsed() > reset_period => {
self.reset_circuit();
false
}
_ => true,
}
} else {
false
}
}
fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
where
E: Display,
{
self.broken_at = Some(Instant::now());
tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}");
metric.inc();
}
fn reset_circuit(&mut self) {
self.broken_at = None;
self.fail_count = 0;
}
}
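To make the intended call pattern concrete, here is a minimal usage sketch: one breaker guarding one expensive operation. The counter names and the guarded operation are invented for the example; the pageserver itself wires this up to its `pageserver_circuit_breaker_broken` / `pageserver_circuit_breaker_unbroken` counters.
// Sketch only: everything below is illustrative, not part of this file.
// Assumes `once_cell` and the workspace `metrics` crate are available, as elsewhere in this repository.
mod circuit_breaker_example {
    use std::time::Duration;
    use metrics::{register_int_counter, IntCounter};
    use once_cell::sync::Lazy;
    use super::CircuitBreaker;
    static EXAMPLE_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
        register_int_counter!("example_breaker_broken", "times the example breaker broke")
            .expect("failed to define a metric")
    });
    static EXAMPLE_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
        register_int_counter!("example_breaker_unbroken", "times the example breaker recovered")
            .expect("failed to define a metric")
    });
    fn expensive_fallible_operation() -> Result<(), std::io::Error> {
        // Stand-in for something like compaction or an upload.
        Ok(())
    }
    pub fn run_guarded(breaker: &mut CircuitBreaker) {
        // Skip the operation entirely while the circuit is broken; is_broken()
        // also auto-resets the breaker once the reset period has elapsed.
        if breaker.is_broken() {
            return;
        }
        match expensive_fallible_operation() {
            Ok(()) => breaker.success(&EXAMPLE_UNBROKEN),
            Err(e) => breaker.fail(&EXAMPLE_BROKEN, e),
        }
    }
    pub fn example() {
        // Break after 3 consecutive failures; auto-reset one minute later.
        let mut breaker =
            CircuitBreaker::new("example".to_string(), 3, Some(Duration::from_secs(60)));
        run_guarded(&mut breaker);
    }
}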

View File

@@ -52,17 +52,17 @@ struct RequestId(String);
/// There could be other ways to implement similar functionality:
///
/// * procmacros placed on top of all handler methods
/// With all the drawbacks of procmacros, this brings no difference implementation-wise,
/// and little code reduction compared to the existing approach.
/// With all the drawbacks of procmacros, this brings no difference implementation-wise,
/// and little code reduction compared to the existing approach.
///
/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
/// implemented for [`RouterBuilder`].
/// Could be simpler, but we don't want to depend on [`routerify`] any further, since we aim to switch to another library later.
/// implemented for [`RouterBuilder`].
/// Could be simpler, but we don't want to depend on [`routerify`] any further, since we aim to switch to another library later.
///
/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
/// later, in a post-response middleware.
/// Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
/// tries to achieve with its `.instrument` used in the current approach.
/// later, in a post-response middleware.
/// Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
/// tries to achieve with its `.instrument` used in the current approach.
///
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output

View File

@@ -74,15 +74,6 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
.transpose()
}
pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
request: &Request<Body>,
param_name: &str,
) -> Result<T, ApiError> {
parse_query_param(request, param_name)?.ok_or_else(|| {
ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
})
}
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
match request.body_mut().data().await {
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),

View File

@@ -302,6 +302,17 @@ pub struct TenantId(Id);
id_newtype!(TenantId);
/// Neon Connection Id identifies long-lived connections (for example a pagestream
/// connection with the page_service). It is used for better logging and tracing.
///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct ConnectionId(Id);
id_newtype!(ConnectionId);
// A pair uniquely identifying a Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TenantTimelineId {

View File

@@ -26,8 +26,6 @@ pub mod auth;
// utility functions and helper traits for unified unique id generation/serialization etc.
pub mod id;
pub mod shard;
mod hex;
pub use hex::Hex;
@@ -98,8 +96,6 @@ pub mod poison;
pub mod toml_edit_ext;
pub mod circuit_breaker;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -1,451 +0,0 @@
//! See `pageserver_api::shard` for description on sharding.
use std::{ops::RangeInclusive, str::FromStr};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use crate::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(pub u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// Formatting helper, for generating the `shard_id` label in traces.
pub struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as [`TenantShardId::unsharded`].
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
pub fn count(&self) -> u8 {
if self.0 > 0 {
self.0
} else {
1
}
}
/// The literal internal value: this is **not** the number of shards in the
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
/// [`Self::count`] if you want to know the cardinality of shards.
pub fn literal(&self) -> u8 {
self.0
}
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
/// uses the legacy format for `TenantShardId`. See also the documentation for
/// [`Self::count`].
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
/// `val` may be zero, or the number of shards in the tenant. `val` is what
/// [`Self::literal`] would return.
pub const fn new(val: u8) -> Self {
Self(val)
}
}
impl ShardNumber {
pub const MAX: Self = Self(u8::MAX);
}
impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self {
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
RangeInclusive::new(
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
},
Self {
tenant_id,
shard_number: ShardNumber::MAX,
shard_count: ShardCount::MAX,
},
)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
}
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
/// is useful when logging from code that is already in a span that includes tenant ID, to
/// keep messages reasonably terse.
pub fn to_index(&self) -> ShardIndex {
ShardIndex {
shard_number: self.shard_number,
shard_count: self.shard_count,
}
}
/// Calculate the children of this TenantShardId when splitting the overall tenant into
/// the given number of shards.
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
let mut child_shards = Vec::new();
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
// Key mapping is based on a round robin mapping of key hash modulo shard count,
// so our child shards are the ones which the same keys would map to.
if shard_number % effective_old_shard_count == self.shard_number.0 {
child_shards.push(TenantShardId {
tenant_id: self.tenant_id,
shard_number: ShardNumber(shard_number),
shard_count: new_shard_count,
})
}
}
child_shards
}
}
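As a worked example of the split mapping above (illustrative only, with an arbitrary tenant id): splitting shard 0 of a two-shard tenant into four shards keeps exactly the children whose shard numbers are congruent to 0 modulo the old shard count, i.e. shards 0 and 2.
// Illustrative check of TenantShardId::split(); the tenant id value is arbitrary.
// `FromStr` is already imported at the top of this file.
fn split_example() {
    let parent = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0002").unwrap();
    assert_eq!(parent.shard_number, ShardNumber(0));
    assert_eq!(parent.shard_count, ShardCount(2));
    let children = parent.split(ShardCount::new(4));
    let numbers: Vec<u8> = children.iter().map(|c| c.shard_number.0).collect();
    // 0 % 2 == 0 and 2 % 2 == 0, so shard 0 of 2 becomes shards 0 and 2 of 4.
    assert_eq!(numbers, vec![0u8, 2]);
}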
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02x}{:02x}",
self.0.shard_number.0, self.0.shard_count.0
)
}
}
impl std::fmt::Display for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.shard_count != ShardCount(0) {
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
} else {
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
// is distinct from the normal single shard case (shard count == 1).
self.tenant_id.fmt(f)
}
}
}
impl std::fmt::Debug for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for TenantShardId {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 16 byte TenantId (32 hex chars), '-', 1 byte shard number, 1 byte shard count (2 hex chars each)
if s.len() == 32 {
// Legacy case: no shard specified
Ok(Self {
tenant_id: TenantId::from_str(s)?,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
})
} else if s.len() == 37 {
let bytes = s.as_bytes();
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
Ok(Self {
tenant_id,
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
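A small round-trip sketch of the string encoding described above: the 37-character form carries an explicit shard suffix, while a bare 32-character TenantId parses as the legacy unsharded form and re-encodes without a suffix. The hex values are arbitrary.
// Illustrative round trip of the TenantShardId string encoding.
fn encoding_example() {
    // Second shard (number 1) of a two-shard tenant, as in the doc comment above.
    let sharded = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102").unwrap();
    assert_eq!(sharded.shard_number, ShardNumber(1));
    assert_eq!(sharded.shard_count, ShardCount(2));
    assert_eq!(sharded.to_string(), "072f1291a5310026820b2fe4b2968934-0102");
    // Legacy unsharded form: no suffix on input, and none when re-encoded.
    let legacy = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934").unwrap();
    assert!(legacy.is_unsharded());
    assert_eq!(legacy.to_string(), "072f1291a5310026820b2fe4b2968934");
}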
impl From<[u8; 18]> for TenantShardId {
fn from(b: [u8; 18]) -> Self {
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
Self {
tenant_id: TenantId::from(tenant_id_bytes),
shard_number: ShardNumber(b[16]),
shard_count: ShardCount(b[17]),
}
}
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
shard_number: number,
shard_count: count,
}
}
pub fn unsharded() -> Self {
Self {
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// For use in constructing remote storage paths: concatenate this with a TenantId
/// to get a fully qualified TenantShardId.
///
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
/// that the legacy pre-sharding remote key format is preserved.
pub fn get_suffix(&self) -> String {
if self.is_unsharded() {
"".to_string()
} else {
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
}
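For remote storage paths, a quick illustrative check of the two suffix cases described above:
// Illustrative: suffixes produced by ShardIndex::get_suffix().
fn suffix_example() {
    assert_eq!(ShardIndex::unsharded().get_suffix(), "");
    let second_of_two = ShardIndex::new(ShardNumber(1), ShardCount::new(2));
    assert_eq!(second_of_two.get_suffix(), "-0102");
}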
impl std::fmt::Display for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Debug for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for ShardIndex {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 1 byte shard number, 1 byte shard count (2 hex chars each)
if s.len() == 4 {
let bytes = s.as_bytes();
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(bytes, &mut shard_parts)?;
Ok(Self {
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 2]> for ShardIndex {
fn from(b: [u8; 2]) -> Self {
Self {
shard_number: ShardNumber(b[0]),
shard_count: ShardCount(b[1]),
}
}
}
impl Serialize for TenantShardId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0;
packed[17] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for TenantShardId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = TenantShardId;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 18])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 18] = Deserialize::deserialize(s)?;
Ok(TenantShardId::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
TenantShardId::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
18,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
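A hedged sketch of the two serialization paths above: human-readable formats see the Display string, while binary formats see the packed 18-byte array. `serde_json` is used purely as an assumed example of a human-readable serializer and is not a dependency implied by this file.
// Sketch only: assumes a JSON serializer (serde_json) is available for illustration.
fn serde_forms_example() {
    let id = TenantShardId::from_str("072f1291a5310026820b2fe4b2968934-0102").unwrap();
    // Human-readable serializers go through the Display form, i.e. the hex string
    // with the shard suffix.
    assert_eq!(
        serde_json::to_string(&id).unwrap(),
        r#""072f1291a5310026820b2fe4b2968934-0102""#
    );
    // Compact binary serializers instead see the packed [u8; 18]: 16 bytes of
    // tenant id, then shard number and shard count (not forward/backward compatible).
}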
impl Serialize for ShardIndex {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Binary encoding is not used in index_part.json, but is included in anticipation of
// switching various structures (e.g. inter-process communication, remote metadata) to more
// compact binary encodings in future.
let mut packed: [u8; 2] = [0; 2];
packed[0] = self.shard_number.0;
packed[1] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for ShardIndex {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = ShardIndex;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 2])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 2] = Deserialize::deserialize(s)?;
Ok(ShardIndex::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
ShardIndex::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
2,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}

View File

@@ -62,7 +62,6 @@ sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true

View File

@@ -8,7 +8,7 @@ license.workspace = true
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest = { workspace = true, features = [ "stream" ] }
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,7 +1,6 @@
use std::collections::HashMap;
use bytes::Bytes;
use detach_ancestor::AncestorDetached;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
@@ -10,8 +9,6 @@ use utils::{
lsn::Lsn,
};
pub use reqwest::Body as ReqwestBody;
pub mod util;
#[derive(Debug, Clone)]
@@ -23,9 +20,6 @@ pub struct Client {
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("send request: {0}")]
SendRequest(reqwest::Error),
#[error("receive body: {0}")]
ReceiveBody(reqwest::Error),
@@ -179,30 +173,19 @@ impl Client {
self.request(Method::GET, uri, ()).await
}
fn start_request<U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
) -> reqwest::RequestBuilder {
let req = self.client.request(method, uri);
if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
}
}
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
self.start_request(method, uri)
.json(&body)
.send()
.await
.map_err(Error::ReceiveBody)
let req = self.client.request(method, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.json(&body).send().await.map_err(Error::ReceiveBody)
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -419,23 +402,6 @@ impl Client {
}
}
pub async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<AncestorDetached> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
self.mgmt_api_endpoint
);
self.request(Method::PUT, &uri, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",
@@ -643,53 +609,4 @@ impl Client {
}),
}
}
pub async fn import_basebackup(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
base_lsn: Lsn,
end_lsn: Lsn,
pg_version: u32,
basebackup_tarball: ReqwestBody,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
self.mgmt_api_endpoint,
);
self.start_request(Method::PUT, uri)
.body(basebackup_tarball)
.send()
.await
.map_err(Error::SendRequest)?
.error_from_body()
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn import_wal(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
wal_tarball: ReqwestBody,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
self.mgmt_api_endpoint,
);
self.start_request(Method::PUT, uri)
.body(wal_tarball)
.send()
.await
.map_err(Error::SendRequest)?
.error_from_body()
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -131,7 +131,7 @@ impl CompactionKey for Key {
pub type CompactionKeySpace<K> = Vec<Range<K>>;
/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey> {
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn key_range(&self) -> &Range<K>;
fn lsn_range(&self) -> &Range<Lsn>;

View File

@@ -47,9 +47,6 @@ use utils::{
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
@@ -660,6 +657,7 @@ fn start_pageserver(
async move {
page_service::libpq_listener_main(
tenant_manager,
broker_client,
pg_auth,
pageserver_listener,
conf.pg_auth_type,

View File

@@ -12,6 +12,7 @@ use serde::de::IntoDeserializer;
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
@@ -91,7 +92,7 @@ pub mod defaults {
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::Disabled;
ImageCompressionAlgorithm::DisabledNoDecompress;
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
@@ -869,6 +870,22 @@ impl PageServerConf {
)
}
pub fn traces_path(&self) -> Utf8PathBuf {
self.workdir.join("traces")
}
pub fn trace_path(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> Utf8PathBuf {
self.traces_path()
.join(tenant_shard_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
remote_path.with_base(&self.workdir)
@@ -1543,6 +1560,34 @@ broker_endpoint = '{broker_endpoint}'
Ok(())
}
#[test]
fn parse_tenant_config() -> anyhow::Result<()> {
let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777";
let trace_read_requests = true;
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{pg_distrib_dir}'
broker_endpoint = '{broker_endpoint}'
[tenant_config]
trace_read_requests = {trace_read_requests}"#,
);
let toml = config_string.parse()?;
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
assert_eq!(
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
);
Ok(())
}
#[test]
fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
let config_string = r#"

View File

@@ -59,7 +59,6 @@
//! 1. It should be easy to forward the context to callees.
//! 2. To propagate more data from high-level to low-level code, the functions in
//! the middle should not need to be modified.
//!
//! The solution is to have a container structure ([`RequestContext`]) that
//! carries the information. Functions that don't care about what's in it
//! pass it along to callees.

View File

@@ -873,6 +873,8 @@ components:
type: string
max_lsn_wal_lag:
type: integer
trace_read_requests:
type: boolean
heatmap_period:
type: string
TenantConfigResponse:

View File

@@ -10,7 +10,6 @@ use std::time::Duration;
use anyhow::{anyhow, Context, Result};
use enumset::EnumSet;
use futures::StreamExt;
use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
@@ -45,14 +44,12 @@ use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::prometheus_metrics_handler;
use utils::http::endpoint::request_span;
use utils::http::request::must_parse_query_param;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use crate::context::{DownloadBehavior, RequestContext};
@@ -1721,9 +1718,7 @@ async fn timeline_detach_ancestor_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::timeline::detach_ancestor;
use pageserver_api::models::detach_ancestor::AncestorDetached;
use crate::tenant::timeline::detach_ancestor::Options;
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1731,7 +1726,7 @@ async fn timeline_detach_ancestor_handler(
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
async move {
let mut options = detach_ancestor::Options::default();
let mut options = Options::default();
let rewrite_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1759,36 +1754,27 @@ async fn timeline_detach_ancestor_handler(
let timeline = tenant.get_timeline(timeline_id, true)?;
let progress = timeline
let (_guard, prepared) = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await?;
// uncomment to allow early as possible Tenant::drop
// drop(tenant);
let res = state
.tenant_manager
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
.await;
let resp = match progress {
detach_ancestor::Progress::Prepared(_guard, prepared) => {
// it would be great to tag the guard on to the tenant activation future
let reparented_timelines = state
.tenant_manager
.complete_detaching_timeline_ancestor(
tenant_shard_id,
timeline_id,
prepared,
ctx,
)
.await
.context("timeline detach ancestor completion")
.map_err(ApiError::InternalServerError)?;
AncestorDetached {
match res {
Ok(reparented_timelines) => {
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
reparented_timelines,
}
}
detach_ancestor::Progress::Done(resp) => resp,
};
};
json_response(StatusCode::OK, resp)
json_response(StatusCode::OK, resp)
}
Err(e) => Err(ApiError::InternalServerError(
e.context("timeline detach completion"),
)),
}
}
.instrument(span)
.await
@@ -2418,189 +2404,6 @@ async fn post_top_tenants(
)
}
async fn put_tenant_timeline_import_basebackup(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
async move {
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
let broker_client = state.broker_client.clone();
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.map_err(ApiError::InternalServerError)
.await?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
// from connecting before that and writing conflicting wal.
//
// This is not relevant for pageserver->pageserver migrations, since there's
// no wal to import. But should be fixed if we want to import from postgres.
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import basebackup provided via CopyData
info!("importing basebackup");
timeline
.import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
// Read the end of the tar archive.
read_tar_eof(body)
.await
.map_err(ApiError::InternalServerError)?;
// TODO check checksum
// Meanwhile you can verify client-side by taking fullbackup
// and checking that it matches in size with what was imported.
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
info!("done");
json_response(StatusCode::OK, ())
}
.instrument(span)
.await
}
async fn put_tenant_timeline_import_wal(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
async move {
let state = get_state(&request);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import wal provided via CopyData
info!("importing wal");
crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
info!("wal import complete");
// Read the end of the tar archive.
read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
// TODO Does it make sense to overshoot?
if timeline.get_last_record_lsn() < end_lsn {
return Err(ApiError::InternalServerError(anyhow::anyhow!("WAL import ended at {} but did not reach the requested end_lsn {end_lsn}", timeline.get_last_record_lsn())));
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})?;
info!("done");
json_response(StatusCode::OK, ())
}.instrument(span).await
}
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// The 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
use tokio::io::AsyncReadExt;
let mut buf = [0u8; 512];
// Read the all-zeros block, and verify it
let mut total_bytes = 0;
while total_bytes < 512 {
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
total_bytes += nbytes;
if nbytes == 0 {
break;
}
}
if total_bytes < 512 {
anyhow::bail!("incomplete or invalid tar EOF marker");
}
if !buf.iter().all(|&x| x == 0) {
anyhow::bail!("invalid tar EOF marker");
}
// Drain any extra zero-blocks after the EOF marker
let mut trailing_bytes = 0;
let mut seen_nonzero_bytes = false;
loop {
let nbytes = reader.read(&mut buf).await?;
trailing_bytes += nbytes;
if !buf.iter().all(|&x| x == 0) {
seen_nonzero_bytes = true;
}
if nbytes == 0 {
break;
}
}
if seen_nonzero_bytes {
anyhow::bail!("unexpected non-zero bytes after the tar archive");
}
if trailing_bytes % 512 != 0 {
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
}
Ok(())
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -2895,13 +2698,5 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|r| testing_api_handler("perf_info", r, perf_info),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
|r| api_handler(r, put_tenant_timeline_import_basebackup),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
|r| api_handler(r, put_tenant_timeline_import_wal),
)
.any(handler_404))
}

View File

@@ -23,6 +23,7 @@ pub mod span;
pub(crate) mod statvfs;
pub mod task_mgr;
pub mod tenant;
pub mod trace;
pub mod utilization;
pub mod virtual_file;
pub mod walingest;

View File

@@ -473,31 +473,6 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum MetricLayerKind {
Delta,
Image,
}
static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_bytes",
"Sum of layer physical sizes in bytes",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_count",
"Number of layers that exist",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_archive_size",
@@ -594,22 +569,6 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_circuit_breaker_broken",
"How many times a circuit breaker has broken"
)
.expect("failed to define a metric")
});
pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_circuit_breaker_unbroken",
"How many times a circuit breaker has been un-broken (recovered)"
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -1497,12 +1456,10 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
}
}
pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_live_connections_started",
"Number of network connections that we started handling",
"pageserver_live_connections_finished",
"Number of network connections that we finished handling",
pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
&["pageserver_connection_kind"]
)
.expect("failed to define a metric")
@@ -1514,7 +1471,10 @@ pub(crate) enum ComputeCommandKind {
PageStream,
Basebackup,
Fullbackup,
ImportBasebackup,
ImportWal,
LeaseLsn,
Show,
}
pub(crate) struct ComputeCommandCounters {
@@ -2166,10 +2126,6 @@ pub(crate) struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
pub(crate) layer_count_image: UIntGauge,
pub(crate) layer_size_delta: UIntGauge,
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2252,42 +2208,6 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let layer_size_image = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_count_image = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_size_delta = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let layer_count_delta = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2342,10 +2262,6 @@ impl TimelineMetrics {
last_record_gauge,
pitr_history_size,
archival_size,
layer_size_image,
layer_count_image,
layer_size_delta,
layer_count_delta,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
@@ -2407,31 +2323,6 @@ impl TimelineMetrics {
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);

View File

@@ -4,7 +4,9 @@
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use bytes::Bytes;
use futures::stream::FuturesUnordered;
use futures::Stream;
use futures::StreamExt;
use pageserver_api::key::Key;
use pageserver_api::models::TenantState;
@@ -26,6 +28,7 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::net::TcpListener;
use std::pin::pin;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
@@ -34,8 +37,10 @@ use std::time::Instant;
use std::time::SystemTime;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
use utils::{
auth::{Claims, Scope, SwappableJwtAuth},
@@ -48,8 +53,9 @@ use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
use crate::pgdatadir_mapping::Version;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
@@ -60,11 +66,13 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Tenant;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -74,6 +82,56 @@ use postgres_ffi::BLCKSZ;
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// The 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
use tokio::io::AsyncReadExt;
let mut buf = [0u8; 512];
// Read the all-zeros block, and verify it
let mut total_bytes = 0;
while total_bytes < 512 {
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
total_bytes += nbytes;
if nbytes == 0 {
break;
}
}
if total_bytes < 512 {
anyhow::bail!("incomplete or invalid tar EOF marker");
}
if !buf.iter().all(|&x| x == 0) {
anyhow::bail!("invalid tar EOF marker");
}
// Drain any extra zero-blocks after the EOF marker
let mut trailing_bytes = 0;
let mut seen_nonzero_bytes = false;
loop {
let nbytes = reader.read(&mut buf).await?;
trailing_bytes += nbytes;
if !buf.iter().all(|&x| x == 0) {
seen_nonzero_bytes = true;
}
if nbytes == 0 {
break;
}
}
if seen_nonzero_bytes {
anyhow::bail!("unexpected non-zero bytes after the tar archive");
}
if trailing_bytes % 512 != 0 {
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
}
Ok(())
}
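A hedged test sketch for the helper above; it relies only on tokio's `AsyncRead` implementation for byte slices and the `rt` feature this crate already enables, and it is illustrative rather than part of the change.
// Illustrative unit-test sketch for read_tar_eof(); names below are invented.
#[cfg(test)]
mod read_tar_eof_example {
    use super::read_tar_eof;
    #[test]
    fn accepts_zero_padding_and_rejects_garbage() {
        let rt = tokio::runtime::Builder::new_current_thread()
            .build()
            .unwrap();
        rt.block_on(async {
            // The second all-zeros block of the tar EOF marker, plus one extra
            // zero record block such as `tar --record-size` may emit.
            let ok = vec![0u8; 512 * 2];
            assert!(read_tar_eof(ok.as_slice()).await.is_ok());
            // Non-zero data after the EOF marker must be rejected.
            let mut bad = vec![0u8; 512];
            bad.extend_from_slice(&[1u8; 512]);
            assert!(read_tar_eof(bad.as_slice()).await.is_err());
        });
    }
}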
///////////////////////////////////////////////////////////////////////////////
///
@@ -83,6 +141,7 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
///
pub async fn libpq_listener_main(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
listener: TcpListener,
auth_type: AuthType,
@@ -127,6 +186,7 @@ pub async fn libpq_listener_main(
false,
page_service_conn_main(
tenant_manager.clone(),
broker_client.clone(),
local_auth,
socket,
auth_type,
@@ -149,14 +209,20 @@ pub async fn libpq_listener_main(
#[instrument(skip_all, fields(peer_addr))]
async fn page_service_conn_main(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
connection_ctx: RequestContext,
) -> anyhow::Result<()> {
let _guard = LIVE_CONNECTIONS
.with_label_values(&["page_service"])
.guard();
// Immediately increment the gauge, then create a job to decrement it on task exit.
// One of the pros of `defer!` is that this will *most probably*
// get called, even in the presence of panics.
let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
socket
.set_nodelay(true)
@@ -201,11 +267,12 @@ async fn page_service_conn_main(
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
let mut conn_handler =
PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend
.run(&mut conn_handler, &task_mgr::shutdown_token())
.run(&mut conn_handler, task_mgr::shutdown_watcher)
.await
{
Ok(()) => {
@@ -232,6 +299,7 @@ struct HandlerTimeline {
}
struct PageServerHandler {
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
@@ -323,11 +391,13 @@ impl From<WaitLsnError> for QueryError {
impl PageServerHandler {
pub fn new(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
connection_ctx: RequestContext,
) -> Self {
PageServerHandler {
tenant_manager,
broker_client,
auth,
claims: None,
connection_ctx,
@@ -410,6 +480,73 @@ impl PageServerHandler {
)
}
fn copyin_stream<'a, IO>(
&'a self,
pgb: &'a mut PostgresBackend<IO>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = io::Result<Bytes>> + 'a
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
async_stream::try_stream! {
loop {
let msg = tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
Err(QueryError::Shutdown)
}
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
};
match msg {
Ok(Some(message)) => {
let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = "client terminated connection with Terminate message during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should always be ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {m:?}");
// error can't happen here, ErrorResponse serialization should always be ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
break;
}
};
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should always be ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
Err(io_error)?;
}
Err(other) => {
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
}
};
}
}
}
#[instrument(skip_all)]
async fn handle_pagerequests<IO>(
&mut self,
@@ -428,6 +565,18 @@ impl PageServerHandler {
.get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
.await?;
// Make request tracer if needed
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path =
tenant
.conf
.trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
Some(Tracer::new(path))
} else {
None
};
// switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -459,6 +608,11 @@ impl PageServerHandler {
trace!("query: {copy_data_bytes:?}");
fail::fail_point!("ps::handle-pagerequest-message");
// Trace request if needed
if let Some(t) = tracer.as_mut() {
t.trace(&copy_data_bytes)
}
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
@@ -564,6 +718,128 @@ impl PageServerHandler {
Ok(())
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
async fn handle_import_basebackup<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
base_lsn: Lsn,
_end_lsn: Lsn,
pg_version: u32,
ctx: RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
// Create empty timeline
info!("creating new timeline");
let tenant = self
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
.await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
// from connecting before that and writing conflicting wal.
//
// This is not relevant for pageserver->pageserver migrations, since there's
// no wal to import. But should be fixed if we want to import from postgres.
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import basebackup provided via CopyData
info!("importing basebackup");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
timeline
.import_basebackup_from_tar(
tenant.clone(),
&mut copyin_reader,
base_lsn,
self.broker_client.clone(),
&ctx,
)
.await?;
// Read the end of the tar archive.
read_tar_eof(copyin_reader).await?;
// TODO check checksum
// Meanwhile you can verify client-side by taking fullbackup
// and checking that it matches in size with what was imported.
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
info!("done");
Ok(())
}
#[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
async fn handle_import_wal<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
ctx: RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import wal provided via CopyData
info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete");
// Read the end of the tar archive.
read_tar_eof(copyin_reader).await?;
// TODO Does it make sense to overshoot?
if timeline.get_last_record_lsn() < end_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
FlushLayerError::Cancelled => QueryError::Shutdown,
other => QueryError::Other(other.into()),
})?;
info!("done");
Ok(())
}
/// Helper function to handle the LSN from client request.
///
/// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1434,6 +1710,109 @@ where
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("import basebackup ") {
// Import the `base` section (everything but the wal) of a basebackup.
// Assumes the tenant already exists on this pageserver.
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
//
// Example import command:
// 1. Get start/end LSN from backup_manifest file
// 2. Run:
// cat my_backup/base.tar | psql -h $PAGESERVER \
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let params = &parts[2..];
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let base_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
let pg_version = u32::from_str(params[4])
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportBasebackup)
.inc();
match self
.handle_import_basebackup(
pgb,
tenant_id,
timeline_id,
base_lsn,
end_lsn,
pg_version,
ctx,
)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.starts_with("import wal ") {
// Import the `pg_wal` section of a basebackup.
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
let params = &parts[2..];
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let start_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportWal)
.inc();
match self
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.to_ascii_lowercase().starts_with("set ") {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
@@ -1479,6 +1858,66 @@ where
))?
}
};
} else if let Some(params) = parts.strip_prefix(&["show"]) {
// show <tenant_id>
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
tracing::Span::current().record("tenant_id", field::display(tenant_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Show)
.inc();
let tenant = self
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
RowDescriptor::int8_col(b"compaction_target_size"),
RowDescriptor::int8_col(b"compaction_period"),
RowDescriptor::int8_col(b"compaction_threshold"),
RowDescriptor::int8_col(b"gc_horizon"),
RowDescriptor::int8_col(b"gc_period"),
RowDescriptor::int8_col(b"image_creation_threshold"),
RowDescriptor::int8_col(b"pitr_interval"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
Some(
tenant
.get_checkpoint_timeout()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_target_size().to_string().as_bytes()),
Some(
tenant
.get_compaction_period()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_threshold().to_string().as_bytes()),
Some(tenant.get_gc_horizon().to_string().as_bytes()),
Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"

View File

@@ -522,7 +522,7 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<Option<TimestampTz>, PageReconstructError> {
let mut max: Option<TimestampTz> = None;
self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
if let Some(max_prev) = max {
max = Some(max_prev.max(timestamp));
} else {
@@ -854,14 +854,13 @@ impl Timeline {
result.add_key(DBDIR_KEY);
// Fetch list of database dirs and iterate them
let dbdir = self.list_dbdirs(lsn, ctx).await?;
let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf)?;
dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
for ((spcnode, dbnode), has_relmap_file) in dbs {
if has_relmap_file {
result.add_key(relmap_file_key(spcnode, dbnode));
}
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
dbs.sort_unstable();
for (spcnode, dbnode) in dbs {
result.add_key(relmap_file_key(spcnode, dbnode));
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
@@ -920,9 +919,6 @@ impl Timeline {
result.add_key(AUX_FILES_KEY);
}
// Add extra keyspaces in the test cases. Some test cases write keys into the storage without
// creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
// and the keys will not be garbage-collected.
#[cfg(test)]
{
let guard = self.extra_test_dense_keyspace.load();
@@ -931,48 +927,13 @@ impl Timeline {
}
}
let dense_keyspace = result.to_keyspace();
let sparse_keyspace = SparseKeySpace(KeySpace {
ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
});
if cfg!(debug_assertions) {
// Verify if the sparse keyspaces are ordered and non-overlapping.
// We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
// category of sparse keys is split into its own image/delta files. If there
// are overlapping keyspaces, they will be automatically merged by keyspace accum,
// and we want the developer to keep the keyspaces separated.
let ranges = &sparse_keyspace.0.ranges;
// TODO: use a single overlaps_with across the codebase
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
for i in 0..ranges.len() {
for j in 0..i {
if overlaps_with(&ranges[i], &ranges[j]) {
panic!(
"overlapping sparse keyspace: {}..{} and {}..{}",
ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
);
}
}
}
for i in 1..ranges.len() {
assert!(
ranges[i - 1].end <= ranges[i].start,
"unordered sparse keyspace: {}..{} and {}..{}",
ranges[i - 1].start,
ranges[i - 1].end,
ranges[i].start,
ranges[i].end
);
}
}
Ok((dense_keyspace, sparse_keyspace))
Ok((
result.to_keyspace(),
/* AUX sparse key space */
SparseKeySpace(KeySpace {
ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
}),
))
}
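A standalone illustration of the half-open-range overlap test asserted above (not part of the diff):

// Two half-open ranges overlap iff neither ends before the other begins.
fn overlaps_with<T: Ord>(a: &std::ops::Range<T>, b: &std::ops::Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

fn main() {
    // Adjacent ranges share no point, so they do not overlap...
    assert!(!overlaps_with(&(0..10), &(10..20)));
    // ...while any shared point counts as an overlap.
    assert!(overlaps_with(&(0..10), &(5..15)));
}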
/// Get cached size of relation if it not updated after specified LSN

View File

@@ -39,7 +39,6 @@ use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::backoff;
use utils::circuit_breaker::CircuitBreaker;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::failpoint_support;
@@ -77,8 +76,7 @@ use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::TENANT;
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -278,10 +276,6 @@ pub struct Tenant {
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
/// Track repeated failures to compact, so that we can back off.
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
/// background warmup.
@@ -1647,31 +1641,13 @@ impl Tenant {
timelines_to_compact
};
// Before doing any I/O work, check our circuit breaker
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
info!("Skipping compaction due to previous failures");
return Ok(());
}
for (timeline_id, timeline) in &timelines_to_compact {
timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
.map_err(|e| {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, &e);
e
})?;
.await?;
}
self.compaction_circuit_breaker
.lock()
.unwrap()
.success(&CIRCUIT_BREAKERS_UNBROKEN);
Ok(())
}
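The comments above describe the intended behaviour; below is a minimal, hypothetical sketch of a circuit breaker with the same shape (consecutive-failure threshold plus a long backoff). The real `utils::circuit_breaker::CircuitBreaker` API is not shown in this diff and may differ.

use std::time::{Duration, Instant};

// Hypothetical minimal circuit breaker: trips after `threshold` consecutive
// failures and stays broken for `backoff` (or indefinitely if `backoff` is None).
struct SimpleBreaker {
    threshold: usize,
    failures: usize,
    broken_at: Option<Instant>,
    backoff: Option<Duration>,
}

impl SimpleBreaker {
    fn new(threshold: usize, backoff: Option<Duration>) -> Self {
        Self { threshold, failures: 0, broken_at: None, backoff }
    }
    fn is_broken(&self) -> bool {
        match (self.broken_at, self.backoff) {
            (Some(at), Some(backoff)) => at.elapsed() < backoff,
            (Some(_), None) => true,
            (None, _) => false,
        }
    }
    fn fail(&mut self) {
        self.failures += 1;
        if self.failures >= self.threshold {
            self.broken_at = Some(Instant::now());
        }
    }
    fn success(&mut self) {
        self.failures = 0;
        self.broken_at = None;
    }
}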
@@ -2365,6 +2341,13 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
pub fn get_trace_read_requests(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.trace_read_requests
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn get_min_resident_size_override(&self) -> Option<u64> {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
@@ -2587,14 +2570,6 @@ impl Tenant {
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
format!("compaction-{tenant_shard_id}"),
5,
// Compaction can be a very expensive operation, and might leak disk space. It also ought
// to be infallible, as long as remote storage is available. So if it repeatedly fails,
// use an extremely long backoff.
Some(Duration::from_secs(3600 * 24)),
)),
activate_now_sem: tokio::sync::Semaphore::new(0),
cancel: CancellationToken::default(),
gate: Gate::default(),
@@ -2912,7 +2887,7 @@ impl Tenant {
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
}
}
@@ -2928,7 +2903,7 @@ impl Tenant {
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.time)
.checked_sub(target.cutoffs.pitr)
.unwrap_or(Lsn(0))
.0,
);
@@ -3743,6 +3718,7 @@ pub(crate) mod harness {
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
trace_read_requests: Some(tenant_conf.trace_read_requests),
eviction_policy: Some(tenant_conf.eviction_policy),
min_resident_size_override: tenant_conf.min_resident_size_override,
evictions_low_residence_duration_metric_threshold: Some(
@@ -4262,7 +4238,7 @@ mod tests {
.source()
.unwrap()
.to_string()
.contains("is earlier than latest GC cutoff"));
.contains("is earlier than latest GC horizon"));
}
}
@@ -6718,8 +6694,8 @@ mod tests {
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x30);
guard.cutoffs.space = Lsn(0x30);
guard.cutoffs.pitr = Lsn(0x30);
guard.cutoffs.horizon = Lsn(0x30);
}
let expected_result = [
@@ -7109,8 +7085,8 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
pitr: Lsn(0x30),
horizon: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,

View File

@@ -19,6 +19,7 @@ use bytes::{BufMut, BytesMut};
use pageserver_api::models::ImageCompressionAlgorithm;
use tokio::io::AsyncWriteExt;
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
use tracing::warn;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
@@ -72,14 +73,22 @@ impl<'a> BlockCursor<'a> {
len_buf.copy_from_slice(&buf[off..off + 4]);
off += 4;
}
len_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
let bit_mask = if self.read_compressed {
!LEN_COMPRESSION_BIT_MASK
} else {
0x7f
};
len_buf[0] &= bit_mask;
u32::from_be_bytes(len_buf) as usize
};
let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
let mut tmp_buf = Vec::new();
let buf_to_write;
let compression = if compression_bits <= BYTE_UNCOMPRESSED {
let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed {
if compression_bits > BYTE_UNCOMPRESSED {
warn!("reading key above future limit ({len} bytes)");
}
buf_to_write = dstbuf;
None
} else if compression_bits == BYTE_ZSTD {
@@ -128,14 +137,14 @@ impl<'a> BlockCursor<'a> {
}
/// Reserved bits for length and compression
pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
/// The maximum size of blobs we support. The highest few bits
/// are reserved for compression and other further uses.
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
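A standalone sketch of how these constants carve up the 4-byte blob header — the high nibble of the first byte carries the compression tag, the remaining 28 bits carry the length — mirroring the masking done in the read path above (illustrative only):

// Illustrative header encode/decode using the same constants as above.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

fn encode_header(len: usize, zstd: bool) -> [u8; 4] {
    assert!(len <= MAX_SUPPORTED_LEN);
    let tag = if zstd { BYTE_ZSTD } else { BYTE_UNCOMPRESSED };
    (((tag as u32) << 24) | len as u32).to_be_bytes()
}

fn decode_header(mut buf: [u8; 4]) -> (u8, usize) {
    let compression_bits = buf[0] & LEN_COMPRESSION_BIT_MASK;
    buf[0] &= !LEN_COMPRESSION_BIT_MASK; // same masking as read_blob above
    (compression_bits, u32::from_be_bytes(buf) as usize)
}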
/// A wrapper of `VirtualFile` that allows users to write blobs.
///
@@ -264,8 +273,12 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
srcbuf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<u64, Error>) {
self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await
self.write_blob_maybe_compressed(
srcbuf,
ctx,
ImageCompressionAlgorithm::DisabledNoDecompress,
)
.await
}
/// Write a blob of data. Returns the offset that it was written to,
@@ -322,13 +335,13 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
if compressed.len() < len {
let compressed_len = compressed.len();
compressed_buf = Some(compressed);
(BYTE_ZSTD, compressed_len, slice.into_inner())
} else {
(BYTE_UNCOMPRESSED, len, slice.into_inner())
}
}
ImageCompressionAlgorithm::Disabled => {
ImageCompressionAlgorithm::Disabled
| ImageCompressionAlgorithm::DisabledNoDecompress => {
(BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
}
};
@@ -382,63 +395,51 @@ impl BlobWriter<false> {
}
#[cfg(test)]
pub(crate) mod tests {
mod tests {
use super::*;
use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
round_trip_test_compressed::<BUFFERED>(blobs, false).await
}
pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
async fn round_trip_test_compressed<const BUFFERED: bool>(
blobs: &[Vec<u8>],
compression: bool,
ctx: &RequestContext,
) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
) -> Result<(), Error> {
let temp_dir = camino_tempfile::tempdir()?;
let pathbuf = temp_dir.path().join("file");
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = if compression {
wtr.write_blob_maybe_compressed(
blob.clone(),
ctx,
&ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await
} else {
wtr.write_blob(blob.clone(), ctx).await
wtr.write_blob(blob.clone(), &ctx).await
};
let offs = res?;
offsets.push(offs);
}
// Write out one page worth of zeros so that we can
// read again with read_blk
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
let offs = res?;
println!("Writing final blob at offs={offs}");
wtr.flush_buffer(ctx).await?;
wtr.flush_buffer(&ctx).await?;
}
Ok((temp_dir, pathbuf, offsets))
}
async fn round_trip_test_compressed<const BUFFERED: bool>(
blobs: &[Vec<u8>],
compression: bool,
) -> Result<(), Error> {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let (_temp_dir, pathbuf, offsets) =
write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
let file = VirtualFile::open(pathbuf, &ctx).await?;
let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new_with_compression(rdr, compression);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
@@ -451,7 +452,7 @@ pub(crate) mod tests {
Ok(())
}
pub(crate) fn random_array(len: usize) -> Vec<u8> {
fn random_array(len: usize) -> Vec<u8> {
let mut rng = rand::thread_rng();
(0..len).map(|_| rng.gen()).collect::<_>()
}

View File

@@ -149,19 +149,24 @@ impl<'a> BlockReaderRef<'a> {
/// ```
///
pub struct BlockCursor<'a> {
pub(super) read_compressed: bool,
reader: BlockReaderRef<'a>,
}
impl<'a> BlockCursor<'a> {
pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
Self::new_with_compression(reader, true)
Self::new_with_compression(reader, false)
}
pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, _read_compressed: bool) -> Self {
BlockCursor { reader }
pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self {
BlockCursor {
read_compressed,
reader,
}
}
// Needed by cli
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
BlockCursor {
read_compressed: false,
reader: BlockReaderRef::FileBlockReader(reader),
}
}
@@ -191,11 +196,25 @@ pub struct FileBlockReader<'a> {
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
compressed_reads: bool,
}
impl<'a> FileBlockReader<'a> {
pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
FileBlockReader { file_id, file }
Self::new_with_compression(file, file_id, false)
}
pub fn new_with_compression(
file: &'a VirtualFile,
file_id: FileId,
compressed_reads: bool,
) -> Self {
FileBlockReader {
file_id,
file,
compressed_reads,
}
}
/// Read a page from the underlying file into given buffer.
@@ -242,7 +261,10 @@ impl<'a> FileBlockReader<'a> {
impl BlockReader for FileBlockReader<'_> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new_with_compression(BlockReaderRef::FileBlockReader(self), true)
BlockCursor::new_with_compression(
BlockReaderRef::FileBlockReader(self),
self.compressed_reads,
)
}
}
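A hedged usage sketch of the constructors above: the `compressed_reads` flag chosen when building the `FileBlockReader` propagates into every cursor it hands out. The surrounding types and the exact call signatures are assumptions based only on what this diff shows.

// Sketch only: wiring a compression-aware reader to a cursor.
// `file`, `file_id`, `offset`, and `ctx` are assumed to come from surrounding code.
async fn read_one_blob(
    file: &VirtualFile,
    file_id: FileId,
    offset: u64,
    ctx: &RequestContext,
) -> anyhow::Result<Vec<u8>> {
    // `true` lets this cursor decode zstd-tagged blobs; `false` keeps legacy behaviour.
    let reader = FileBlockReader::new_with_compression(file, file_id, true);
    let cursor = reader.block_cursor();
    let blob = cursor.read_blob(offset, ctx).await?;
    Ok(blob)
}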

View File

@@ -335,6 +335,7 @@ pub struct TenantConf {
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
/// to avoid eager reconnects.
pub max_lsn_wal_lag: NonZeroU64,
pub trace_read_requests: bool,
pub eviction_policy: EvictionPolicy,
pub min_resident_size_override: Option<u64>,
// See the corresponding metric's help string.
@@ -435,6 +436,10 @@ pub struct TenantConfOpt {
#[serde(default)]
pub max_lsn_wal_lag: Option<NonZeroU64>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub trace_read_requests: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub eviction_policy: Option<EvictionPolicy>,
@@ -514,6 +519,9 @@ impl TenantConfOpt {
.lagging_wal_timeout
.unwrap_or(global_conf.lagging_wal_timeout),
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
trace_read_requests: self
.trace_read_requests
.unwrap_or(global_conf.trace_read_requests),
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
min_resident_size_override: self
.min_resident_size_override
@@ -573,6 +581,7 @@ impl Default for TenantConf {
.expect("cannot parse default walreceiver lagging wal timeout"),
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.expect("cannot parse default max walreceiver Lsn wal lag"),
trace_read_requests: false,
eviction_policy: EvictionPolicy::NoEviction,
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
@@ -650,6 +659,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
max_lsn_wal_lag: value.max_lsn_wal_lag,
trace_read_requests: value.trace_read_requests,
eviction_policy: value.eviction_policy,
min_resident_size_override: value.min_resident_size_override,
evictions_low_residence_duration_metric_threshold: value

View File

@@ -550,10 +550,10 @@ where
/// We maintain the length of the stack to be always greater than zero.
/// Two exceptions are:
/// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
/// So because other methods cannot see the intermediate state invariant still holds.
/// So because other methods cannot see the intermediate state invariant still holds.
/// 2. `Self::finish`. It consumes self and does not return it back,
/// which means that this is where the structure is destroyed.
/// Thus stack of zero length cannot be observed by other methods.
/// which means that this is where the structure is destroyed.
/// Thus stack of zero length cannot be observed by other methods.
stack: Vec<BuildNode<L>>,
/// Last key that was appended to the tree. Used to sanity check that append

View File

@@ -241,7 +241,7 @@ use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::upload_queue::SetDeletedFlagProgress;
use super::Generation;
pub(crate) use download::{
@@ -1930,31 +1930,6 @@ impl RemoteTimelineClient {
}
}
}
/// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
/// externally to RemoteTimelineClient.
pub(crate) fn initialized_upload_queue(
&self,
) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
let mut inner = self.upload_queue.lock().unwrap();
inner.initialized_mut()?;
Ok(UploadQueueAccessor { inner })
}
}
pub(crate) struct UploadQueueAccessor<'a> {
inner: std::sync::MutexGuard<'a, UploadQueue>,
}
impl<'a> UploadQueueAccessor<'a> {
pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
match &*self.inner {
UploadQueue::Initialized(x) => &x.clean.0,
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
unreachable!("checked before constructing")
}
}
}
}
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {

View File

@@ -176,24 +176,6 @@ pub(crate) struct Lineage {
///
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
// FIXME: this is insufficient even for path of two timelines for future wal recovery
// purposes:
//
// assuming a "old main" which has received most of the WAL, and has a branch "new main",
// starting a bit before "old main" last_record_lsn. the current version works fine,
// because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
//
// then assuming "new main" would similarly receive a branch right before its last_record_lsn,
// "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
// here. however, we cannot recover from WAL using only that information, we would need the
// whole ancestry here:
//
// ```json
// [
// ["old main", ancestor_lsn("new main"), _],
// ["new main", ancestor_lsn("new new main"), _]
// ]
// ```
#[serde(skip_serializing_if = "Option::is_none", default)]
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}
@@ -235,14 +217,6 @@ impl Lineage {
self.original_ancestor
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
}
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
self.original_ancestor.is_some()
}
pub(crate) fn is_reparented(&self) -> bool {
!self.reparenting_history.is_empty()
}
}
#[cfg(test)]

View File

@@ -135,9 +135,11 @@ pub struct TimelineInputs {
ancestor_lsn: Lsn,
last_record: Lsn,
latest_gc_cutoff: Lsn,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
/// Cutoff point based on GC settings
next_pitr_cutoff: Lsn,
next_gc_cutoff: Lsn,
/// Cutoff point calculated from the user-supplied 'max_retention_period'
retention_param_cutoff: Option<Lsn>,
@@ -148,7 +150,7 @@ pub struct TimelineInputs {
/// Gathers the inputs for the tenant sizing model.
///
/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`TimelineInputs::latest_gc_cutoff`].
///
@@ -156,8 +158,11 @@ pub struct TimelineInputs {
///
/// ```text
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_pitr_cutoff latest
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// ```
///
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
/// tenant size will be zero.
pub(super) async fn gather_inputs(
tenant: &Tenant,
limit: &Arc<Semaphore>,
@@ -167,7 +172,7 @@ pub(super) async fn gather_inputs(
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
// refresh is needed to update [`timeline::GcCutoffs`]
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
tenant.refresh_gc_info(cancel, ctx).await?;
// Collect information about all the timelines
@@ -231,18 +236,20 @@ pub(super) async fn gather_inputs(
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
//
// We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
// than our internal space cutoff. This means that if someone drops a database and waits for their
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// the space cutoff.
let mut next_pitr_cutoff = gc_info.cutoffs.time;
// horizon_cutoff.
let pitr_cutoff = gc_info.cutoffs.pitr;
let horizon_cutoff = gc_info.cutoffs.horizon;
let mut next_gc_cutoff = pitr_cutoff;
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
if next_pitr_cutoff < param_cutoff {
next_pitr_cutoff = param_cutoff;
if next_gc_cutoff < param_cutoff {
next_gc_cutoff = param_cutoff;
}
Some(param_cutoff)
} else {
@@ -256,7 +263,7 @@ pub(super) async fn gather_inputs(
.copied()
.collect::<Vec<_>>();
// next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -284,10 +291,10 @@ pub(super) async fn gather_inputs(
)
}
// Add a point for the PITR cutoff
let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
// Add a point for the GC cutoff
let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
if !branch_start_needed {
lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
}
lsns.sort_unstable();
@@ -326,7 +333,7 @@ pub(super) async fn gather_inputs(
parent: Some(parent),
lsn: lsn.0,
size: None,
needed: lsn > next_pitr_cutoff,
needed: lsn > next_gc_cutoff,
},
timeline_id: timeline.timeline_id,
kind,
@@ -350,8 +357,8 @@ pub(super) async fn gather_inputs(
segment: Segment {
parent: Some(lease_parent),
lsn: lsn.0,
size: None, // Filled in later, if necessary
needed: lsn > next_pitr_cutoff, // only needed if the point is within retention.
size: None, // Filled in later, if necessary
needed: lsn > next_gc_cutoff, // only needed if the point is within retention.
},
timeline_id: timeline.timeline_id,
kind: LsnKind::LeaseStart,
@@ -391,7 +398,9 @@ pub(super) async fn gather_inputs(
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
next_pitr_cutoff,
horizon_cutoff,
pitr_cutoff,
next_gc_cutoff,
retention_param_cutoff,
lease_points,
});
@@ -733,7 +742,9 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/18D3D98",
"last_record": "0/2230CD0",
"latest_gc_cutoff": "0/1698C48",
"next_pitr_cutoff": "0/2210CD0",
"horizon_cutoff": "0/2210CD0",
"pitr_cutoff": "0/2210CD0",
"next_gc_cutoff": "0/2210CD0",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -742,7 +753,9 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/176D998",
"last_record": "0/1837770",
"latest_gc_cutoff": "0/1698C48",
"next_pitr_cutoff": "0/1817770",
"horizon_cutoff": "0/1817770",
"pitr_cutoff": "0/1817770",
"next_gc_cutoff": "0/1817770",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -751,7 +764,9 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/0",
"last_record": "0/18D3D98",
"latest_gc_cutoff": "0/1698C48",
"next_pitr_cutoff": "0/18B3D98",
"horizon_cutoff": "0/18B3D98",
"pitr_cutoff": "0/18B3D98",
"next_gc_cutoff": "0/18B3D98",
"retention_param_cutoff": null,
"lease_points": []
}
@@ -805,7 +820,9 @@ fn verify_size_for_one_branch() {
"ancestor_lsn": "0/0",
"last_record": "47/280A5860",
"latest_gc_cutoff": "47/240A5860",
"next_pitr_cutoff": "47/240A5860",
"horizon_cutoff": "47/240A5860",
"pitr_cutoff": "47/240A5860",
"next_gc_cutoff": "47/240A5860",
"retention_param_cutoff": "0/0",
"lease_points": []
}
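To make the cutoff selection above concrete, a minimal standalone sketch (plain u64 values standing in for `Lsn`):

// Effective GC cutoff for sizing: the PITR cutoff, tightened by the optional
// user-supplied max_retention_period (mirrors the logic in gather_inputs above).
fn effective_cutoff(pitr_cutoff: u64, last_record: u64, max_retention_period: Option<u64>) -> u64 {
    match max_retention_period {
        Some(period) => pitr_cutoff.max(last_record.saturating_sub(period)),
        None => pitr_cutoff,
    }
}

fn main() {
    // A short retention period moves the cutoff forward, so less history is "needed".
    assert_eq!(effective_cutoff(100, 1_000, Some(200)), 800);
    // A period longer than the PITR window leaves the PITR cutoff in charge.
    assert_eq!(effective_cutoff(100, 1_000, Some(950)), 100);
}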

View File

@@ -7,9 +7,6 @@ pub(crate) mod layer;
mod layer_desc;
mod layer_name;
#[cfg(test)]
pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::task_mgr::TaskKind;
@@ -676,26 +673,6 @@ impl LayerAccessStats {
},
}
}
/// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
///
/// This indicates whether the layer has been used for some purpose that would motivate
/// us to keep it on disk, such as for serving a getpage request.
fn accessed(&self) -> bool {
let locked = self.0.lock().unwrap();
let inner = &locked.for_eviction_policy;
// Consider it accessed if the most recent access is more recent than
// the most recent change in residence status.
match (
inner.last_accesses.recent(),
inner.last_residence_changes.recent(),
) {
(None, _) => false,
(Some(_), None) => true,
(Some(a), Some(r)) => a.when >= r.timestamp,
}
}
}
/// Get a layer descriptor from a layer.

View File

@@ -223,11 +223,6 @@ pub struct DeltaLayerInner {
file: VirtualFile,
file_id: FileId,
#[allow(dead_code)]
layer_key_range: Range<Key>,
#[allow(dead_code)]
layer_lsn_range: Range<Lsn>,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
}
@@ -457,15 +452,8 @@ impl DeltaLayerWriterInner {
ctx: &RequestContext,
) -> (Vec<u8>, anyhow::Result<()>) {
assert!(self.lsn_range.start <= lsn);
let compression = if val.len() >= 8192 {
// For full page images, respect configured image compression algorithm.
self.conf.image_compression
} else {
// For small writes, do not use compression. Compression ratios on tiny buffers do not justify CPU cost.
ImageCompressionAlgorithm::Disabled
};
// We don't want to use compression in delta layer creation
let compression = ImageCompressionAlgorithm::DisabledNoDecompress;
let (val, res) = self
.blob_writer
.write_blob_maybe_compressed(val, ctx, compression)
@@ -486,10 +474,6 @@ impl DeltaLayerWriterInner {
self.blob_writer.size() + self.tree.borrow_writer().size()
}
fn size_values(&self) -> u64 {
self.blob_writer.size()
}
///
/// Finish writing the delta layer.
///
@@ -679,10 +663,6 @@ impl DeltaLayerWriter {
self.inner.as_ref().unwrap().size()
}
pub fn size_values(&self) -> u64 {
self.inner.as_ref().unwrap().size_values()
}
///
/// Finish writing the delta layer.
///
@@ -762,16 +742,6 @@ impl DeltaLayer {
}
impl DeltaLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.layer_key_range
}
#[cfg(test)]
pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
&self.layer_lsn_range
}
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure
/// - outer has the permanent failure
@@ -820,8 +790,6 @@ impl DeltaLayerInner {
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
max_vectored_read_bytes,
layer_key_range: actual_summary.key_range,
layer_lsn_range: actual_summary.lsn_range,
}))
}
@@ -1195,7 +1163,9 @@ impl DeltaLayerInner {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
layer: self,
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
Adapter(self),
)),
};
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
@@ -1334,7 +1304,7 @@ impl DeltaLayerInner {
offsets.start.pos(),
offsets.end.pos(),
meta,
max_read_size,
Some(max_read_size),
))
}
} else {
@@ -1439,7 +1409,7 @@ impl DeltaLayerInner {
let keys = self.load_keys(ctx).await?;
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
let buf = val.load_raw(ctx).await?;
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
@@ -1474,7 +1444,8 @@ impl DeltaLayerInner {
use pageserver_api::key::CHECKPOINT_KEY;
use postgres_ffi::CheckPoint;
if key == CHECKPOINT_KEY {
let val = val.load(ctx).await?;
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
match val {
Value::Image(img) => {
let checkpoint = CheckPoint::decode(&img)?;
@@ -1559,24 +1530,17 @@ pub struct DeltaEntry<'a> {
/// Reference to an on-disk value
pub struct ValueRef<'a> {
blob_ref: BlobRef,
layer: &'a DeltaLayerInner,
reader: BlockCursor<'a>,
}
impl<'a> ValueRef<'a> {
/// Loads the value from disk
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
let buf = self.load_raw(ctx).await?;
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
Ok(val)
}
async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
self.layer,
)));
let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
Ok(buf)
}
}
pub(crate) struct Adapter<T>(T);
@@ -1634,17 +1598,13 @@ impl<'a> DeltaLayerIterator<'a> {
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
let blob_ref = BlobRef(value);
let offset = blob_ref.pos();
if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
break batch_plan;
}
} else {
self.is_end = true;
let data_end_offset = self.delta_layer.index_start_offset();
if let Some(item) = self.planner.handle_range_end(data_end_offset) {
break item;
} else {
return Ok(()); // TODO: test empty iterator
}
break self.planner.handle_range_end(data_end_offset);
}
};
let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
@@ -1679,7 +1639,7 @@ impl<'a> DeltaLayerIterator<'a> {
}
#[cfg(test)]
pub(crate) mod test {
mod test {
use std::collections::BTreeMap;
use itertools::MinMaxResult;
@@ -1687,7 +1647,6 @@ pub(crate) mod test {
use rand::RngCore;
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::Tenant;
@@ -1697,7 +1656,6 @@ pub(crate) mod test {
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
DEFAULT_PG_VERSION,
};
use bytes::Bytes;
/// Construct an index for a fictional delta layer and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -2259,31 +2217,15 @@ pub(crate) mod test {
}
}
pub(crate) fn sort_delta(
(k1, l1, _): &(Key, Lsn, Value),
(k2, l2, _): &(Key, Lsn, Value),
) -> std::cmp::Ordering {
(k1, l1).cmp(&(k2, l2))
}
pub(crate) fn sort_delta_value(
(k1, l1, v1): &(Key, Lsn, Value),
(k2, l2, v2): &(Key, Lsn, Value),
) -> std::cmp::Ordering {
let order_1 = if v1.is_image() { 0 } else { 1 };
let order_2 = if v2.is_image() { 0 } else { 1 };
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
pub(crate) async fn produce_delta_layer(
async fn produce_delta_layer(
tenant: &Tenant,
tline: &Arc<Timeline>,
mut deltas: Vec<(Key, Lsn, Value)>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
deltas.sort_by(sort_delta);
deltas.sort_by(|(k1, l1, _), (k2, l2, _)| (k1, l1).cmp(&(k2, l2)));
let (key_start, _, _) = deltas.first().unwrap();
let (key_max, _, _) = deltas.last().unwrap();
let (key_max, _, _) = deltas.first().unwrap();
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
let lsn_end = Lsn(lsn_max.0 + 1);
@@ -2328,6 +2270,9 @@ pub(crate) mod test {
#[tokio::test]
async fn delta_layer_iterator() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
let (tenant, ctx) = harness.load().await;

View File

@@ -165,6 +165,7 @@ pub struct ImageLayerInner {
file_id: FileId,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
compressed_reads: bool,
}
impl std::fmt::Debug for ImageLayerInner {
@@ -178,7 +179,8 @@ impl std::fmt::Debug for ImageLayerInner {
impl ImageLayerInner {
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
@@ -266,9 +268,10 @@ impl ImageLayer {
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
let path = self.path();
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
.await
.and_then(|res| res)?;
let loaded =
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx)
.await
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -369,16 +372,6 @@ impl ImageLayer {
}
impl ImageLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.key_range
}
#[cfg(test)]
pub(crate) fn lsn(&self) -> Lsn {
self.lsn
}
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure
/// - outer has the permanent failure
@@ -387,6 +380,7 @@ impl ImageLayerInner {
lsn: Lsn,
summary: Option<Summary>,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
support_compressed_reads: bool,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path, ctx).await {
@@ -430,6 +424,7 @@ impl ImageLayerInner {
file,
file_id,
max_vectored_read_bytes,
compressed_reads: support_compressed_reads,
key_range: actual_summary.key_range,
}))
}
@@ -440,7 +435,8 @@ impl ImageLayerInner {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
@@ -500,12 +496,14 @@ impl ImageLayerInner {
&self,
ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
let mut result = Vec::new();
let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let cursor = block_reader.block_cursor();
while let Some(item) = stream.next().await {
// TODO: dedup code with get_reconstruct_value
@@ -540,7 +538,8 @@ impl ImageLayerInner {
.into(),
);
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
@@ -701,7 +700,8 @@ impl ImageLayerInner {
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
ImageLayerIterator {
@@ -809,11 +809,7 @@ impl ImageLayerWriterInner {
ctx: &RequestContext,
) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let compression = self.conf.image_compression;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
.await;
let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
// TODO: re-use the buffer for `img` further upstack
let off = res?;
@@ -998,17 +994,14 @@ impl<'a> ImageLayerIterator<'a> {
Key::from_slice(&raw_key[..KEY_SIZE]),
self.image_layer.lsn,
offset,
BlobFlag::None,
) {
break batch_plan;
}
} else {
self.is_end = true;
let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
if let Some(item) = self.planner.handle_range_end(payload_end) {
break item;
} else {
return Ok(()); // TODO: a test case on empty iterator
}
break self.planner.handle_range_end(payload_end);
}
};
let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);

View File

@@ -715,22 +715,16 @@ impl InMemoryLayer {
res?;
}
}
// Hold the permit until the IO is done; if we didn't, one could drop this future,
// thereby releasing the permit, but the Vec<u8> remains allocated until the IO completes.
// => we'd have more concurrent Vec<u8> than allowed as per the semaphore.
drop(_concurrency_permit);
}
}
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
//
// If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
// the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
// Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
//
// We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
drop(_concurrency_permit);
Ok(Some(delta_layer))
}
}

View File

@@ -693,18 +693,6 @@ impl Drop for LayerInner {
// and we could be delaying shutdown for nothing.
}
if let Some(timeline) = self.timeline.upgrade() {
// Only need to decrement metrics if the timeline still exists: otherwise
// it will have already de-registered these metrics via TimelineMetrics::shutdown
if self.desc.is_delta() {
timeline.metrics.layer_count_delta.dec();
timeline.metrics.layer_size_delta.sub(self.desc.file_size);
} else {
timeline.metrics.layer_count_image.dec();
timeline.metrics.layer_size_image.sub(self.desc.file_size);
}
}
if !*self.wanted_deleted.get_mut() {
return;
}
@@ -803,15 +791,6 @@ impl LayerInner {
(heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
};
// This object acts as a RAII guard on these metrics: increment on construction
if desc.is_delta() {
timeline.metrics.layer_count_delta.inc();
timeline.metrics.layer_size_delta.add(desc.file_size);
} else {
timeline.metrics.layer_count_image.inc();
timeline.metrics.layer_size_image.add(desc.file_size);
}
LayerInner {
conf,
debug_str: {
@@ -1490,22 +1469,14 @@ impl LayerInner {
let duration = SystemTime::now().duration_since(local_layer_mtime);
match duration {
Ok(elapsed) => {
let accessed = self.access_stats.accessed();
if accessed {
// Only layers used for reads contribute to our "low residence" metric that is used
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
// to be rapidly evicted without contributing to this metric.
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
}
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
tracing::info!(
residence_millis = elapsed.as_millis(),
accessed,
"evicted layer after known residence period"
);
}
@@ -1714,6 +1685,7 @@ impl DownloadedLayer {
lsn,
summary,
Some(owner.conf.max_vectored_read_bytes),
owner.conf.image_compression.allow_decompression(),
ctx,
)
.await

View File

@@ -25,7 +25,7 @@ pub struct PersistentLayerDesc {
///
/// - For an open in-memory layer, the end bound is MAX_LSN
/// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
/// range start
/// range start
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
pub lsn_range: Range<Lsn>,
/// Whether this is a delta layer, and also, is this incremental.

View File

@@ -1,551 +0,0 @@
use std::{
cmp::Ordering,
collections::{binary_heap, BinaryHeap},
};
use pageserver_api::key::Key;
use utils::lsn::Lsn;
use crate::{context::RequestContext, repository::Value};
use super::{
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
image_layer::{ImageLayerInner, ImageLayerIterator},
};
#[derive(Clone, Copy)]
enum LayerRef<'a> {
Image(&'a ImageLayerInner),
Delta(&'a DeltaLayerInner),
}
impl<'a> LayerRef<'a> {
fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
match self {
Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
}
}
}
enum LayerIterRef<'a> {
Image(ImageLayerIterator<'a>),
Delta(DeltaLayerIterator<'a>),
}
impl LayerIterRef<'_> {
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
match self {
Self::Delta(x) => x.next().await,
Self::Image(x) => x.next().await,
}
}
}
/// This type plays several roles at once
/// 1. Unified iterator for image and delta layers.
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
/// 3. Lazy creation of the real delta/image iterator.
enum IteratorWrapper<'a> {
NotLoaded {
ctx: &'a RequestContext,
first_key_lower_bound: (Key, Lsn),
layer: LayerRef<'a>,
},
Loaded {
iter: PeekableLayerIterRef<'a>,
},
}
struct PeekableLayerIterRef<'a> {
iter: LayerIterRef<'a>,
peeked: Option<(Key, Lsn, Value)>, // None == end
}
impl<'a> PeekableLayerIterRef<'a> {
async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result<Self> {
let peeked = iter.next().await?;
Ok(Self { iter, peeked })
}
fn peek(&self) -> &Option<(Key, Lsn, Value)> {
&self.peeked
}
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
let result = self.peeked.take();
self.peeked = self.iter.next().await?;
Ok(result)
}
}
impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}
impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use std::cmp::Ordering;
let a = self.peek_next_key_lsn_value();
let b = other.peek_next_key_lsn_value();
match (a, b) {
(Some((k1, l1, v1)), Some((k2, l2, v2))) => {
fn map_value_to_num(val: &Option<&Value>) -> usize {
match val {
None => 0,
Some(Value::Image(_)) => 1,
Some(Value::WalRecord(_)) => 2,
}
}
let order_1 = map_value_to_num(&v1);
let order_2 = map_value_to_num(&v2);
// When the (key, lsn) pairs are equal, the unloaded iter will always appear before the loaded one.
// And note that we do a reverse at the end of the comparison, so it works with the max heap.
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
(Some(_), None) => Ordering::Less,
(None, Some(_)) => Ordering::Greater,
(None, None) => Ordering::Equal,
}
.reverse()
}
}
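Why the trailing `.reverse()`: `BinaryHeap` is a max-heap, so inverting the comparison makes the heap surface the smallest (key, lsn) first, which is what a k-way merge needs. A tiny standalone illustration:

use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn main() {
    // A plain BinaryHeap pops the largest element first.
    let mut max_heap: BinaryHeap<u32> = [3, 1, 2].into_iter().collect();
    assert_eq!(max_heap.pop(), Some(3));

    // Reversing the ordering (what the manual .reverse() above achieves)
    // turns it into a min-heap.
    let mut min_heap: BinaryHeap<Reverse<u32>> = [3, 1, 2].into_iter().map(Reverse).collect();
    assert_eq!(min_heap.pop(), Some(Reverse(1)));
}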
impl<'a> IteratorWrapper<'a> {
pub fn create_from_image_layer(
image_layer: &'a ImageLayerInner,
ctx: &'a RequestContext,
) -> Self {
Self::NotLoaded {
layer: LayerRef::Image(image_layer),
first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
ctx,
}
}
pub fn create_from_delta_layer(
delta_layer: &'a DeltaLayerInner,
ctx: &'a RequestContext,
) -> Self {
Self::NotLoaded {
layer: LayerRef::Delta(delta_layer),
first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
ctx,
}
}
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
match self {
Self::Loaded { iter } => iter
.peek()
.as_ref()
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
Self::NotLoaded {
first_key_lower_bound: (key, lsn),
..
} => Some((key, *lsn, None)),
}
}
// CORRECTNESS: this function must always take `&mut self`, never `&self`.
//
// The reason is that `impl Ord for Self` evaluates differently after this function
// returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when
// the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut`
// and not just `PeekMut::deref`
// If we don't take `&mut self`, the call could go through `PeekMut::deref` instead,
// and the heap would never be repaired after loading changes the ordering.
async fn load(&mut self) -> anyhow::Result<()> {
assert!(!self.is_loaded());
let Self::NotLoaded {
ctx,
first_key_lower_bound,
layer,
} = self
else {
unreachable!()
};
let iter = layer.iter(ctx);
let iter = PeekableLayerIterRef::create(iter).await?;
if let Some((k1, l1, _)) = iter.peek() {
let (k2, l2) = first_key_lower_bound;
debug_assert!((k1, l1) >= (k2, l2));
}
*self = Self::Loaded { iter };
Ok(())
}
fn is_loaded(&self) -> bool {
matches!(self, Self::Loaded { .. })
}
/// Correctness: must load the iterator before using.
///
/// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it.
/// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
/// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
let Self::Loaded { iter } = self else {
panic!("must load the iterator before using")
};
iter.next().await
}
}
/// A merge iterator over delta/image layer iterators. When duplicated records are
/// found, the iterator will not perform any deduplication, and the caller should handle
/// these situations. Duplicated records can arise in several ways:
/// * Two same delta at the same LSN.
/// * Two same image at the same LSN.
/// * Delta/image at the same LSN where the image has already applied the delta.
/// The iterator will always put the image before the delta.
pub struct MergeIterator<'a> {
heap: BinaryHeap<IteratorWrapper<'a>>,
}
impl<'a> MergeIterator<'a> {
pub fn create(
deltas: &[&'a DeltaLayerInner],
images: &[&'a ImageLayerInner],
ctx: &'a RequestContext,
) -> Self {
let mut heap = Vec::with_capacity(images.len() + deltas.len());
for image in images {
heap.push(IteratorWrapper::create_from_image_layer(image, ctx));
}
for delta in deltas {
heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx));
}
Self {
heap: BinaryHeap::from(heap),
}
}
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
while let Some(mut iter) = self.heap.peek_mut() {
if !iter.is_loaded() {
// Once we load the iterator, we can know the real first key-value pair in the iterator.
// We put it back into the heap because another, still-unloaded layer may have a key
// in [potential_first_key, loaded_first_key).
iter.load().await?;
continue;
}
let Some(item) = iter.next().await? else {
// If the iterator returns None, we pop this iterator. Actually, in the current implementation,
// we order None > Some, and all the rest of the iterators should return None.
binary_heap::PeekMut::pop(iter);
continue;
};
return Ok(Some(item));
}
Ok(None)
}
}
#[cfg(test)]
mod tests {
use super::*;
use itertools::Itertools;
use pageserver_api::key::Key;
use utils::lsn::Lsn;
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
},
walrecord::NeonWalRecord,
DEFAULT_PG_VERSION,
};
async fn assert_merge_iter_equal(
merge_iter: &mut MergeIterator<'_>,
expect: &[(Key, Lsn, Value)],
) {
let mut expect_iter = expect.iter();
loop {
let o1 = merge_iter.next().await.unwrap();
let o2 = expect_iter.next();
assert_eq!(o1.is_some(), o2.is_some());
if o1.is_none() && o2.is_none() {
break;
}
let (k1, l1, v1) = o1.unwrap();
let (k2, l2, v2) = o2.unwrap();
assert_eq!(&k1, k2);
assert_eq!(l1, *l2);
assert_eq!(&v1, v2);
}
}
#[tokio::test]
async fn merge_in_between() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let test_deltas1 = vec![
(
get_key(0),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"test")),
),
(
get_key(5),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"test")),
),
];
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();
let test_deltas2 = vec![
(
get_key(3),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"test")),
),
(
get_key(4),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"test")),
),
];
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
.await
.unwrap();
let mut merge_iter = MergeIterator::create(
&[
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
let mut expect = Vec::new();
expect.extend(test_deltas1);
expect.extend(test_deltas2);
expect.sort_by(sort_delta);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
}
#[tokio::test]
async fn delta_merge() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
const N: usize = 1000;
let test_deltas1 = (0..N)
.map(|idx| {
(
get_key(idx as u32 / 10),
Lsn(0x20 * ((idx as u64) % 10 + 1)),
Value::Image(Bytes::from(format!("img{idx:05}"))),
)
})
.collect_vec();
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();
let test_deltas2 = (0..N)
.map(|idx| {
(
get_key(idx as u32 / 10),
Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10),
Value::Image(Bytes::from(format!("img{idx:05}"))),
)
})
.collect_vec();
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
.await
.unwrap();
let test_deltas3 = (0..N)
.map(|idx| {
(
get_key(idx as u32 / 10 + N as u32),
Lsn(0x10 * ((idx as u64) % 10 + 1)),
Value::Image(Bytes::from(format!("img{idx:05}"))),
)
})
.collect_vec();
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
.await
.unwrap();
let mut merge_iter = MergeIterator::create(
&[
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
let mut expect = Vec::new();
expect.extend(test_deltas1);
expect.extend(test_deltas2);
expect.extend(test_deltas3);
expect.sort_by(sort_delta);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
}
#[tokio::test]
async fn delta_image_mixed_merge() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge").unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// In this test case, we want to test if the iterator still works correctly with multiple copies
// of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
// Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
// An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
// could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
// one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
// correctly process these situations and return everything as-is, and the upper layer of the system
// will handle duplicated LSNs.
let test_deltas1 = vec![
(
get_key(0),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(0),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("a")),
),
(
get_key(5),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(5),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("b")),
),
];
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();
let mut test_deltas2 = test_deltas1.clone();
test_deltas2.push((
get_key(10),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
.await
.unwrap();
let test_deltas3 = vec![
(
get_key(0),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"")),
),
(
get_key(5),
Lsn(0x18),
Value::Image(Bytes::copy_from_slice(b"b")),
),
(
get_key(15),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
),
];
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
.await
.unwrap();
let mut test_deltas4 = test_deltas3.clone();
test_deltas4.push((
get_key(20),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
.await
.unwrap();
let mut expect = Vec::new();
expect.extend(test_deltas1);
expect.extend(test_deltas2);
expect.extend(test_deltas3);
expect.extend(test_deltas4);
expect.sort_by(sort_delta_value);
// Test with different layer order for MergeIterator::create to ensure the order
// is stable.
let mut merge_iter = MergeIterator::create(
&[
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
let mut merge_iter = MergeIterator::create(
&[
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
}
}
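The tests above rely on the merge iterator emitting duplicated key/LSN entries verbatim rather than collapsing them. A minimal, self-contained sketch of that duplicate-preserving k-way merge follows, using plain (key, lsn, value) tuples instead of real layers; every name here is illustrative and not part of the pageserver API.
use std::cmp::Reverse;
use std::collections::BinaryHeap;
/// Merge several vectors that are already sorted by (key, lsn),
/// keeping duplicates: entries with identical (key, lsn) are all
/// emitted, and the caller is expected to resolve them.
fn k_merge(mut inputs: Vec<Vec<(u32, u64, &'static str)>>) -> Vec<(u32, u64, &'static str)> {
    let mut heap = BinaryHeap::new();
    for (src, input) in inputs.iter_mut().enumerate() {
        input.reverse(); // so we can pop from the back cheaply
        if let Some(item) = input.pop() {
            heap.push(Reverse((item, src)));
        }
    }
    let mut out = Vec::new();
    while let Some(Reverse((item, src))) = heap.pop() {
        out.push(item);
        if let Some(next) = inputs[src].pop() {
            heap.push(Reverse((next, src)));
        }
    }
    out
}
fn main() {
    // Two "layers" both carrying key 0 at lsn 0x10, as in the test above.
    let a = vec![(0, 0x10, "+a"), (0, 0x18, "+b")];
    let b = vec![(0, 0x10, "+a"), (5, 0x20, "img")];
    let merged = k_merge(vec![a, b]);
    assert_eq!(merged.len(), 4); // duplicates are kept, not collapsed
}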

View File

@@ -66,13 +66,12 @@ use std::{
ops::{Deref, Range},
};
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
config::defaults::DEFAULT_PITR_INTERVAL,
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
storage_layer::PersistentLayerDesc,
},
};
use crate::{
@@ -99,7 +98,6 @@ use crate::{
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
};
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
use crate::{
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
virtual_file::{MaybeFatalIo, VirtualFile},
@@ -198,7 +196,7 @@ impl PartialOrd for Hole {
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
drop(rlock)
}
@@ -271,7 +269,7 @@ pub struct Timeline {
///
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
@@ -478,32 +476,37 @@ impl GcInfo {
}
}
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
/// between time-based and space-based retention for observability and consumption metrics purposes.
/// The `GcInfo` component describing which Lsns need to be retained.
#[derive(Debug)]
pub(crate) struct GcCutoffs {
/// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
/// history we must keep to retain a specified number of bytes of WAL.
pub(crate) space: Lsn,
/// Keep everything newer than this point.
///
/// This is calculated by subtracting 'gc_horizon' setting from
/// last-record LSN
///
/// FIXME: is this inclusive or exclusive?
pub(crate) horizon: Lsn,
/// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
/// history we must keep to enable reading back at least the PITR interval duration.
pub(crate) time: Lsn,
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
/// point.
///
/// This is calculated by finding a number such that a record is needed for PITR
/// if and only if its LSN is larger than 'pitr_cutoff'.
pub(crate) pitr: Lsn,
}
impl Default for GcCutoffs {
fn default() -> Self {
Self {
space: Lsn::INVALID,
time: Lsn::INVALID,
horizon: Lsn::INVALID,
pitr: Lsn::INVALID,
}
}
}
impl GcCutoffs {
fn select_min(&self) -> Lsn {
std::cmp::min(self.space, self.time)
std::cmp::min(self.horizon, self.pitr)
}
}
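As a hedged illustration of the cutoff selection above (simplified stand-in types, not the actual pageserver definitions): the effective GC cutoff is the minimum of the space-based cutoff (horizon/space) and the time-based cutoff (pitr/time), so whichever policy needs more history wins.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Lsn(u64); // simplified stand-in for utils::lsn::Lsn
struct Cutoffs {
    horizon: Lsn, // space-based: roughly last_record_lsn minus gc_horizon bytes
    pitr: Lsn,    // time-based: LSN corresponding to (now - pitr_interval)
}
impl Cutoffs {
    fn select_min(&self) -> Lsn {
        std::cmp::min(self.horizon, self.pitr)
    }
}
fn main() {
    let c = Cutoffs { horizon: Lsn(0x5000), pitr: Lsn(0x3000) };
    // Everything at or above the minimum must be retained; GC may only
    // remove layers entirely below Lsn(0x3000) here.
    assert_eq!(c.select_min(), Lsn(0x3000));
}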
@@ -725,9 +728,6 @@ impl From<CreateImageLayersError> for CompactionError {
fn from(e: CreateImageLayersError) -> Self {
match e {
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
CreateImageLayersError::Other(e) => {
CompactionError::Other(e.context("create image layers"))
}
_ => CompactionError::Other(e.into()),
}
}
@@ -862,7 +862,7 @@ impl Timeline {
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.time)
.checked_sub(gc_info.cutoffs.pitr)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
@@ -1561,7 +1561,7 @@ impl Timeline {
) -> anyhow::Result<()> {
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
lsn,
**latest_gc_cutoff_lsn,
);
@@ -3404,8 +3404,6 @@ impl Timeline {
}
}
#[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
#[allow(clippy::doc_lazy_continuation)]
/// Get the data needed to reconstruct all keys in the provided keyspace
///
/// The algorithm is as follows:
@@ -4472,10 +4470,10 @@ impl Timeline {
/// are required. Since checking if new image layers are required is expensive in
/// terms of CPU, we only do it in the following cases:
/// 1. If the timeline has ingested sufficient WAL to justify the cost
/// 2. If enough time has passed since the last check:
/// 1. For large tenants, we wish to perform the check more often since they
/// suffer from the lack of image layers
/// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval
/// 2. If enough time has passed since the last check
/// 2.1. For large tenants, we wish to perform the check more often since they
/// suffer from the lack of image layers
/// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
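A sketch of the check-frequency rule from the doc comment above, under stated assumptions: the state fields and interval values below are hypothetical, and only the shape of the decision (enough WAL ingested, or enough time elapsed, with a shorter interval for large tenants) comes from the text.
use std::time::{Duration, Instant};
struct ImageCheckState {
    last_check_lsn: u64,
    last_check_at: Instant,
    logical_size: u64,
}
const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
fn should_check_image_layers(state: &ImageCheckState, current_lsn: u64, checkpoint_distance: u64) -> bool {
    // 1. Enough WAL has been ingested since the last check.
    if current_lsn.saturating_sub(state.last_check_lsn) >= checkpoint_distance {
        return true;
    }
    // 2. Enough wall-clock time has passed; large tenants are checked more often.
    let interval = if state.logical_size >= LARGE_TENANT_THRESHOLD {
        Duration::from_secs(60)       // hypothetical value
    } else {
        Duration::from_secs(60 * 60)  // hypothetical value
    };
    state.last_check_at.elapsed() >= interval
}
fn main() {
    let state = ImageCheckState {
        last_check_lsn: 0x1000,
        last_check_at: Instant::now(),
        logical_size: 4 * 1024 * 1024 * 1024,
    };
    // 256 MiB of WAL since the last check triggers the WAL-distance rule.
    assert!(should_check_image_layers(&state, 0x1000 + 256 * 1024 * 1024, 256 * 1024 * 1024));
}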
@@ -4571,22 +4569,6 @@ impl Timeline {
start = img_range.end;
continue;
}
} else if let ImageLayerCreationMode::Force = mode {
// When forced to create image layers, we might try and create them where they already
// exist. This mode is only used in tests/debug.
let layers = self.layers.read().await;
if layers.contains_key(&PersistentLayerKey {
key_range: img_range.clone(),
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
is_delta: false,
}) {
tracing::info!(
"Skipping image layer at {lsn} {}..{}, already exists",
img_range.start,
img_range.end
);
continue;
}
}
let image_layer_writer = ImageLayerWriter::new(
@@ -4717,7 +4699,7 @@ impl Timeline {
/// Requires a timeline that:
/// - has an ancestor to detach from
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
/// a technical requirement
/// a technical requirement
///
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
/// polled again until completion.
@@ -4729,7 +4711,13 @@ impl Timeline {
tenant: &crate::tenant::Tenant,
options: detach_ancestor::Options,
ctx: &RequestContext,
) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
) -> Result<
(
completion::Completion,
detach_ancestor::PreparedTimelineDetach,
),
detach_ancestor::Error,
> {
detach_ancestor::prepare(self, tenant, options, ctx).await
}
@@ -4936,21 +4924,24 @@ impl Timeline {
}
/// Find the Lsns above which layer files need to be retained on
/// garbage collection.
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete
/// page versions more aggressively.
///
/// We calculate two cutoffs, one based on time and one based on WAL size. `pitr`
/// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
/// the space-based retention.
/// TODO: that's wishful thinking, compaction doesn't actually do that
/// currently.
///
/// This function doesn't simply calculate time- and space-based retention: it treats time-based
/// retention as authoritative if enabled, and falls back to space-based retention if calculating
/// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might
/// be different from the `space_cutoff` input. Callers should treat the min() of the two cutoffs
/// in the response as the GC cutoff point for the timeline.
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
/// needed by read-only nodes. (As of this writing, the caller just passes
/// the latest LSN subtracted by a constant, and doesn't do anything smart
/// to figure out what read-only nodes might actually need.)
///
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
/// whether a record is needed for PITR.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(super) async fn find_gc_cutoffs(
&self,
space_cutoff: Lsn,
cutoff_horizon: Lsn,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -4963,87 +4954,58 @@ impl Timeline {
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
if cfg!(test) {
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
if pitr == Duration::ZERO {
return Ok(GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
});
}
}
// Calculate a time-based limit on how much to retain:
// - if PITR interval is set, then this is our cutoff.
// - if PITR interval is not set, then we do a lookup
// based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
let time_cutoff = {
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
//
// Some unit tests depend on garbage-collection working even when
// CLOG data is missing, so that find_lsn_for_timestamp() doesn't
// work, so avoid calling it altogether if time-based retention is not
// configured. It would be pointless anyway.
let pitr_cutoff = if pitr != Duration::ZERO {
let now = SystemTime::now();
let time_range = if pitr == Duration::ZERO {
humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
match self
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
.await?
{
LsnForTimestamp::Present(lsn) => lsn,
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there haven't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
self.get_last_record_lsn()
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
}
} else {
pitr
};
// If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
let timestamp = to_pg_timestamp(time_cutoff);
match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
LsnForTimestamp::Present(lsn) => Some(lsn),
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there haven't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
Some(self.get_last_record_lsn())
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
None
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
None
}
// If we don't have enough data to convert to LSN,
// play safe and don't remove any layers.
*self.get_latest_gc_cutoff_lsn()
}
} else {
// No time-based retention was configured. Interpret this as "keep no history".
self.get_last_record_lsn()
};
Ok(match (pitr, time_cutoff) {
(Duration::ZERO, Some(time_cutoff)) => {
// PITR is not set. Retain the size-based limit, or the default time retention,
// whichever requires less data.
GcCutoffs {
time: self.get_last_record_lsn(),
space: std::cmp::max(time_cutoff, space_cutoff),
}
}
(Duration::ZERO, None) => {
// PITR is not set, and time lookup failed
GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
}
}
(_, None) => {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: *self.get_latest_gc_cutoff_lsn(),
space: space_cutoff,
}
}
(_, Some(time_cutoff)) => {
// PITR interval is set and we looked up timestamp successfully. Ignore
// size based retention and make time cutoff authoritative
GcCutoffs {
time: time_cutoff,
space: time_cutoff,
}
}
Ok(GcCutoffs {
horizon: cutoff_horizon,
pitr: pitr_cutoff,
})
}
@@ -5068,11 +5030,11 @@ impl Timeline {
return Err(GcError::TimelineCancelled);
}
let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let gc_info = self.gc_info.read().unwrap();
let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
let time_cutoff = gc_info.cutoffs.time;
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
let pitr_cutoff = gc_info.cutoffs.pitr;
let retain_lsns = gc_info.retain_lsns.clone();
// Gets the maximum LSN that holds the valid lease.
@@ -5082,14 +5044,14 @@ impl Timeline {
let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
(
space_cutoff,
time_cutoff,
horizon_cutoff,
pitr_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
)
};
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
let standby_horizon = self.standby_horizon.load();
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
@@ -5118,8 +5080,8 @@ impl Timeline {
let res = self
.gc_timeline(
space_cutoff,
time_cutoff,
horizon_cutoff,
pitr_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
new_gc_cutoff,
@@ -5137,8 +5099,8 @@ impl Timeline {
async fn gc_timeline(
&self,
space_cutoff: Lsn,
time_cutoff: Lsn,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
max_lsn_with_valid_lease: Option<Lsn>,
new_gc_cutoff: Lsn,
@@ -5199,22 +5161,22 @@ impl Timeline {
result.layers_total += 1;
// 1. Is it newer than GC horizon cutoff point?
if l.get_lsn_range().end > space_cutoff {
if l.get_lsn_range().end > horizon_cutoff {
debug!(
"keeping {} because it's newer than space_cutoff {}",
"keeping {} because it's newer than horizon_cutoff {}",
l.layer_name(),
space_cutoff,
horizon_cutoff,
);
result.layers_needed_by_cutoff += 1;
continue 'outer;
}
// 2. Is it newer than the PiTR cutoff point?
if l.get_lsn_range().end > time_cutoff {
if l.get_lsn_range().end > pitr_cutoff {
debug!(
"keeping {} because it's newer than time_cutoff {}",
"keeping {} because it's newer than pitr_cutoff {}",
l.layer_name(),
time_cutoff,
pitr_cutoff,
);
result.layers_needed_by_pitr += 1;
continue 'outer;

View File

@@ -26,10 +26,9 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -196,7 +195,7 @@ impl Timeline {
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.time
self.gc_info.read().unwrap().cutoffs.pitr
);
let layers = self.layers.read().await;
@@ -380,7 +379,7 @@ impl Timeline {
};
let begin = tokio::time::Instant::now();
let phase1_layers_locked = self.layers.read().await;
let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -400,9 +399,9 @@ impl Timeline {
}
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
async fn compact_level0_phase1<'a>(
self: &'a Arc<Self>,
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
async fn compact_level0_phase1(
self: &Arc<Self>,
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
ctx: &RequestContext,
@@ -416,7 +415,6 @@ impl Timeline {
.map(|x| guard.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
let threshold = self.get_compaction_threshold();
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
@@ -447,22 +445,6 @@ impl Timeline {
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
// Accumulate the size of layers in `deltas_to_compact`
let mut deltas_to_compact_bytes = 0;
// Under normal circumstances, we will accumulate up to compaction_interval L0s of size
// checkpoint_distance each. To avoid edge cases using extra system resources, bound our
// work in this function to only operate on this much delta data at once.
//
// Take the max of the configured value & the default, so that tests that configure tiny values
// can still use a sensible amount of memory, but if a deployed system configures bigger values we
// still let them compact a full stack of L0s in one go.
let delta_size_limit = std::cmp::max(
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
) as u64
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
for l in level0_deltas_iter {
let lsn_range = &l.layer_desc().lsn_range;
@@ -471,20 +453,7 @@ impl Timeline {
break;
}
deltas_to_compact.push(l.download_and_keep_resident().await?);
deltas_to_compact_bytes += l.metadata().file_size;
prev_lsn_end = lsn_range.end;
if deltas_to_compact_bytes >= delta_size_limit {
info!(
l0_deltas_selected = deltas_to_compact.len(),
l0_deltas_total = level0_deltas.len(),
"L0 compaction picker hit max delta layer size limit: {}",
delta_size_limit
);
// Proceed with compaction, but only a subset of L0s
break;
}
}
let lsn_range = Range {
start: deltas_to_compact
@@ -1021,7 +990,7 @@ impl Timeline {
"enhanced legacy compaction currently does not support retain_lsns (branches)"
)));
}
let gc_cutoff = gc_info.cutoffs.select_min();
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
let mut selected_layers = Vec::new();
// TODO: consider retain_lsns
drop(gc_info);

View File

@@ -182,15 +182,13 @@ async fn remove_timeline_from_tenant(
/// 5. Delete index part
/// 6. Delete meta, timeline directory
/// 7. Delete mark file
///
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
/// and we possibly need to continue deletion of remote files.
/// and we possibly need to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
/// index but still have local metadata, timeline directory and delete mark.
///
/// index but still have local metadata, timeline directory and delete mark.
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
#[derive(Default)]
pub enum DeleteTimelineFlow {
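To make the "resumable from any step" property concrete, a standalone sketch follows; it is not the real DeleteTimelineFlow, and the step names are loosely taken from the numbered list above.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
enum Step {
    DeleteIndexPart,
    DeleteTimelineDir,
    DeleteMarkFile,
}
/// Run the remaining steps, starting from the first one that has not
/// completed yet. `resume_from` would be recovered from persisted state
/// after a crash or restart.
fn run(resume_from: Step) {
    use Step::*;
    for step in [DeleteIndexPart, DeleteTimelineDir, DeleteMarkFile] {
        if step < resume_from {
            continue; // already done before the crash
        }
        // ... perform the step, then persist that it completed ...
        println!("running {step:?}");
    }
}
fn main() {
    // After a crash between steps 5 and 6, deletion resumes at step 6.
    run(Step::DeleteTimelineDir);
}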

View File

@@ -10,7 +10,6 @@ use crate::{
},
virtual_file::{MaybeFatalIo, VirtualFile},
};
use pageserver_api::models::detach_ancestor::AncestorDetached;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
@@ -40,9 +39,6 @@ pub(crate) enum Error {
#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
#[error("failpoint: {}", .0)]
Failpoint(&'static str),
}
impl From<Error> for ApiError {
@@ -61,41 +57,11 @@ impl From<Error> for ApiError {
| e @ Error::CopyDeltaPrefix(_)
| e @ Error::UploadRewritten(_)
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_)
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
}
}
}
impl From<crate::tenant::upload_queue::NotInitialized> for Error {
fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self {
// treat all as shutting down signals, even though that is not entirely correct
// (uninitialized state)
Error::ShuttingDown
}
}
impl From<FlushLayerError> for Error {
fn from(value: FlushLayerError) -> Self {
match value {
FlushLayerError::Cancelled => Error::ShuttingDown,
FlushLayerError::NotRunning(_) => {
// FIXME(#6424): technically statically unreachable right now, given how we never
// drop the sender
Error::ShuttingDown
}
FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => {
Error::FlushAncestor(value)
}
}
}
}
pub(crate) enum Progress {
Prepared(completion::Completion, PreparedTimelineDetach),
Done(AncestorDetached),
}
pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}
@@ -122,7 +88,7 @@ pub(super) async fn prepare(
tenant: &Tenant,
options: Options,
ctx: &RequestContext,
) -> Result<Progress, Error> {
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
use Error::*;
let Some((ancestor, ancestor_lsn)) = detached
@@ -130,67 +96,15 @@ pub(super) async fn prepare(
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
{
let accessor = detached.remote_client.initialized_upload_queue()?;
// we are safe to inspect the latest uploaded, because we can only witness this after
// restart is complete and ancestor is no more.
let latest = accessor.latest_uploaded_index_part();
if !latest.lineage.is_detached_from_original_ancestor() {
return Err(NoAncestor);
}
}
// detached has previously been detached; let's inspect each of the current timelines and
// report back the timelines which have been reparented by our detach
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
.map(|tl| (tl.ancestor_lsn, tl.clone()))
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early not to give inconsistent results in
// sharded enviroment.
any_shutdown = true;
true
}
},
);
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
let mut reparented = all_direct_children;
// why this instead of hashset? there is a reason, but I've forgotten it many times.
// TODO: check if we have already been detached; for this we need to read the stored data
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
// a projection of the committed data.
//
// maybe if this was a hashset we would not be able to distinguish some race condition.
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
return Ok(Progress::Done(AncestorDetached {
reparented_timelines: reparented
.into_iter()
.map(|(_, tl)| tl.timeline_id)
.collect(),
}));
// the error is wrong per openapi
return Err(NoAncestor);
};
if !ancestor_lsn.is_valid() {
// rare case, probably wouldn't even load
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
return Err(NoAncestor);
}
@@ -217,15 +131,6 @@ pub(super) async fn prepare(
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
fail::fail_point!(
"timeline-detach-ancestor::before_starting_after_locking",
|_| Err(Error::Failpoint(
"timeline-detach-ancestor::before_starting_after_locking"
))
);
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
let span =
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
@@ -246,7 +151,7 @@ pub(super) async fn prepare(
}
};
res?;
res.map_err(FlushAncestor)?;
// we do not need to wait for uploads to complete but we do need `struct Layer`,
// copying delta prefix is unsupported currently for `InMemoryLayer`.
@@ -254,7 +159,7 @@ pub(super) async fn prepare(
elapsed_ms = started_at.elapsed().as_millis(),
"froze and flushed the ancestor"
);
Ok::<_, Error>(())
Ok(())
}
.instrument(span)
.await?;
@@ -378,7 +283,7 @@ pub(super) async fn prepare(
let prepared = PreparedTimelineDetach { layers: new_layers };
Ok(Progress::Prepared(guard, prepared))
Ok((guard, prepared))
}
fn partition_work(
@@ -445,11 +350,7 @@ async fn copy_lsn_prefix(
target_timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown};
if target_timeline.cancel.is_cancelled() {
return Err(ShuttingDown);
}
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
@@ -628,7 +529,7 @@ pub(super) async fn complete(
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
reparented.push(timeline.timeline_id);
}
Ok(None) => {
// lets just ignore this for now. one or all reparented timelines could had
@@ -650,12 +551,5 @@ pub(super) async fn complete(
tracing::info!("failed to reparent some candidates");
}
reparented.sort_unstable();
let reparented = reparented
.into_iter()
.map(|(_, timeline_id)| timeline_id)
.collect();
Ok(reparented)
}

View File

@@ -339,10 +339,6 @@ impl LayerManager {
self.layer_fmgr.contains(layer)
}
pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.layer_fmgr.contains_key(key)
}
pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layer_fmgr.0.keys().cloned().collect_vec()
}
@@ -367,10 +363,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
.clone()
}
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
self.0.contains_key(key)
}
pub(crate) fn insert(&mut self, layer: T) {
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
if present.is_some() && cfg!(debug_assertions) {

View File

@@ -11,11 +11,11 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
/// Calculation consists of two stages:
///
/// 1. Initial size calculation. That might take a long time, because it requires
/// reading all layers containing relation sizes at `initial_part_end`.
/// reading all layers containing relation sizes at `initial_part_end`.
///
/// 2. Collecting an incremental part and adding that to the initial size.
/// Increments are appended as the walreceiver writes new timeline data,
/// which results in an increase or decrease of the logical size.
/// Increments are appended as the walreceiver writes new timeline data,
/// which results in an increase or decrease of the logical size.
pub(super) struct LogicalSize {
/// Size, potentially slow to compute. Calculating this might require reading multiple
/// layers, and even ancestor's layers.
@@ -45,17 +45,17 @@ pub(super) struct LogicalSize {
/// Size shouldn't ever be negative, but this is signed for two reasons:
///
/// 1. If we initialized the "baseline" size lazily, while we already
/// process incoming WAL, the incoming WAL records could decrement the
/// variable and temporarily make it negative. (This is just future-proofing;
/// the initialization is currently not done lazily.)
/// process incoming WAL, the incoming WAL records could decrement the
/// variable and temporarily make it negative. (This is just future-proofing;
/// the initialization is currently not done lazily.)
///
/// 2. If there is a bug and we e.g. forget to increment it in some cases
/// when size grows, but remember to decrement it when it shrinks again, the
/// variable could go negative. In that case, it seems better to at least
/// try to keep tracking it, rather than clamp or overflow it. Note that
/// get_current_logical_size() will clamp the returned value to zero if it's
/// negative, and log an error. Could set it permanently to zero or some
/// special value to indicate "broken" instead, but this will do for now.
/// when size grows, but remember to decrement it when it shrinks again, the
/// variable could go negative. In that case, it seems better to at least
/// try to keep tracking it, rather than clamp or overflow it. Note that
/// get_current_logical_size() will clamp the returned value to zero if it's
/// negative, and log an error. Could set it permanently to zero or some
/// special value to indicate "broken" instead, but this will do for now.
///
/// Note that we also expose a copy of this value as a prometheus metric,
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`

View File

@@ -2,13 +2,13 @@
//! To do so, a current implementation needs to do the following:
//!
//! * acknowledge the timelines that it needs to stream WAL into.
//! Pageserver is able to dynamically (un)load tenants on attach and detach,
//! hence the WAL receiver needs to react to such events.
//! Pageserver is able to dynamically (un)load tenants on attach and detach,
//! hence the WAL receiver needs to react to such events.
//!
//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
//! Without this data, no WAL streaming is possible currently.
//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
//! Without this data, no WAL streaming is possible currently.
//!
//! Only one active WAL streaming connection is allowed at a time.
//! The connection is supposed to be updated periodically, based on safekeeper timeline data.
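A rough, hypothetical sketch of the "one active connection, periodically re-evaluated from safekeeper data" rule described above; the types and the selection criterion are invented for illustration and do not mirror the real walreceiver code.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct SafekeeperId(u32);
#[derive(Clone, Copy, Debug)]
struct BrokerInfo {
    sk: SafekeeperId,
    commit_lsn: u64, // how far this safekeeper has the WAL
}
/// Pick the safekeeper to stream from: here, simply the one with the most WAL.
fn choose(candidates: &[BrokerInfo]) -> Option<SafekeeperId> {
    candidates.iter().max_by_key(|c| c.commit_lsn).map(|c| c.sk)
}
fn main() {
    let mut current: Option<SafekeeperId> = None;
    // Imagine this block runs every time fresh broker data arrives.
    let snapshot = vec![
        BrokerInfo { sk: SafekeeperId(1), commit_lsn: 0x100 },
        BrokerInfo { sk: SafekeeperId(2), commit_lsn: 0x180 },
    ];
    let best = choose(&snapshot);
    if best != current {
        // Drop the old connection (if any) and connect to the new candidate,
        // so at most one WAL streaming connection is active at a time.
        current = best;
    }
    assert_eq!(current, Some(SafekeeperId(2)));
}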

View File

@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -208,9 +208,14 @@ pub(super) async fn handle_walreceiver_connection(
.instrument(tracing::info_span!("poller")),
);
let _guard = LIVE_CONNECTIONS
.with_label_values(&["wal_receiver"])
.guard();
// Immediately increment the gauge, then create a job to decrement it on task exit.
// One of the pros of `defer!` is that this will *most probably*
// get called, even in the presence of panics.
let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
let identify = identify_system(&replication_client).await?;
info!("{identify:?}");
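The increment-then-deferred-decrement pattern above can be reproduced without any metrics crate; this hedged sketch uses a plain atomic counter and a Drop guard in place of LIVE_CONNECTIONS_COUNT and scopeguard::defer!.
use std::sync::atomic::{AtomicI64, Ordering};
static LIVE_CONNECTIONS: AtomicI64 = AtomicI64::new(0);
struct ConnectionGauge;
impl ConnectionGauge {
    fn acquire() -> Self {
        LIVE_CONNECTIONS.fetch_add(1, Ordering::Relaxed);
        ConnectionGauge
    }
}
impl Drop for ConnectionGauge {
    // Runs on every exit path, including unwinding after a panic,
    // which is what the `defer!` in the diff above relies on as well.
    fn drop(&mut self) {
        LIVE_CONNECTIONS.fetch_sub(1, Ordering::Relaxed);
    }
}
fn main() {
    {
        let _guard = ConnectionGauge::acquire();
        assert_eq!(LIVE_CONNECTIONS.load(Ordering::Relaxed), 1);
    } // guard dropped here, gauge goes back down
    assert_eq!(LIVE_CONNECTIONS.load(Ordering::Relaxed), 0);
}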

View File

@@ -228,20 +228,18 @@ impl UploadQueue {
Ok(self.initialized_mut().expect("we just set it"))
}
pub(crate) fn initialized_mut(
&mut self,
) -> Result<&mut UploadQueueInitialized, NotInitialized> {
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
use UploadQueue::*;
match self {
Uninitialized => Err(NotInitialized::Uninitialized),
Uninitialized => Err(NotInitialized::Uninitialized.into()),
Initialized(x) => {
if x.shutting_down {
Err(NotInitialized::ShuttingDown)
Err(NotInitialized::ShuttingDown.into())
} else {
Ok(x)
}
}
Stopped(_) => Err(NotInitialized::Stopped),
Stopped(_) => Err(NotInitialized::Stopped.into()),
}
}

View File

@@ -20,13 +20,11 @@ use std::num::NonZeroUsize;
use bytes::BytesMut;
use pageserver_api::key::Key;
use tokio::io::AsyncWriteExt;
use tokio_epoll_uring::BoundedBuf;
use utils::lsn::Lsn;
use utils::vec_map::VecMap;
use crate::context::RequestContext;
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
use crate::virtual_file::VirtualFile;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -70,7 +68,7 @@ impl VectoredRead {
}
}
#[derive(Eq, PartialEq, Debug)]
#[derive(Eq, PartialEq)]
pub(crate) enum VectoredReadExtended {
Yes,
No,
@@ -93,7 +91,7 @@ impl VectoredReadBuilder {
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: usize,
max_read_size: Option<usize>,
) -> Self {
let mut blobs_at = VecMap::default();
blobs_at
@@ -104,9 +102,10 @@ impl VectoredReadBuilder {
start: start_offset,
end: end_offset,
blobs_at,
max_read_size: Some(max_read_size),
max_read_size,
}
}
/// Attempt to extend the current read with a new blob if the start
/// offset matches the current end of the vectored read
/// and the resulting size is below the max read size
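A hedged sketch of that extension rule with simplified types (the real VectoredReadBuilder also tracks per-blob metadata): a new blob is absorbed only when it starts exactly at the current end and the grown read stays within the size limit, if one is set.
#[derive(Debug, PartialEq, Eq)]
enum Extended {
    Yes,
    No,
}
struct ReadBuilder {
    start: u64,
    end: u64,
    max_read_size: Option<u64>,
}
impl ReadBuilder {
    fn extend(&mut self, blob_start: u64, blob_end: u64) -> Extended {
        let fits = match self.max_read_size {
            Some(max) => blob_end - self.start <= max,
            None => true, // caller-controlled limit, as in the streaming planner
        };
        if blob_start == self.end && fits {
            self.end = blob_end;
            Extended::Yes
        } else {
            Extended::No
        }
    }
}
fn main() {
    let mut b = ReadBuilder { start: 0, end: 100, max_read_size: Some(150) };
    assert_eq!(b.extend(100, 140), Extended::Yes); // contiguous and within the limit
    assert_eq!(b.extend(140, 200), Extended::No);  // would exceed max_read_size
}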
@@ -165,7 +164,7 @@ pub struct VectoredReadPlanner {
// Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
prev: Option<(Key, Lsn, u64, BlobFlag)>,
max_read_size: usize,
max_read_size: Option<usize>,
}
impl VectoredReadPlanner {
@@ -173,7 +172,20 @@ impl VectoredReadPlanner {
Self {
blobs: BTreeMap::new(),
prev: None,
max_read_size,
max_read_size: Some(max_read_size),
}
}
/// This function should *only* be used if the caller has a way to control the limit. e.g., in [`StreamingVectoredReadPlanner`],
/// it uses the vectored read planner to avoid duplicating the logic for handling blob start/end, while expecting the vectored
/// read planner to give a single read over a contiguous range of bytes in the image layer. Therefore, it does not need the
/// code path to split reads into chunks of `max_read_size`, and controls the read size itself.
#[cfg(test)]
pub(crate) fn new_caller_controlled_max_limit() -> Self {
Self {
blobs: BTreeMap::new(),
prev: None,
max_read_size: None,
}
}
@@ -191,9 +203,9 @@ impl VectoredReadPlanner {
///
/// The `flag` argument has two interesting values:
/// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs.
/// This is used for WAL records that `will_init`.
/// This is used for WAL records that `will_init`.
/// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
/// if the blob is cached.
/// if the blob is cached.
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
// Implementation note: internally lag behind by one blob such that
// we have a start and end offset when initialising [`VectoredRead`]
@@ -303,7 +315,7 @@ impl<'a> VectoredBlobReader<'a> {
read.size(),
buf.capacity()
);
let mut buf = self
let buf = self
.file
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
.await?
@@ -325,68 +337,38 @@ impl<'a> VectoredBlobReader<'a> {
.chain(std::iter::once(None)),
);
// Some scratch space, put here for reusing the allocation
let mut decompressed_vec = Vec::new();
for ((offset, meta), next) in pairs {
let offset_in_buf = offset - start_offset;
let first_len_byte = buf[offset_in_buf as usize];
// Each blob is prefixed by a header containing its size and compression information.
// Each blob is prefixed by a header containing its size.
// Extract the size and skip that header to find the start of the data.
// The size can be 1 or 4 bytes. The most significant bit is 0 in the
// 1 byte case and 1 in the 4 byte case.
let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 {
(1, first_len_byte as u64, BYTE_UNCOMPRESSED)
let (size_length, blob_size) = if first_len_byte < 0x80 {
(1, first_len_byte as u64)
} else {
let mut blob_size_buf = [0u8; 4];
let offset_in_buf = offset_in_buf as usize;
blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
(
4,
u32::from_be_bytes(blob_size_buf) as u64,
compression_bits,
)
blob_size_buf[0] &= 0x7f;
(4, u32::from_be_bytes(blob_size_buf) as u64)
};
let start_raw = offset_in_buf + size_length;
let end_raw = match next {
let start = offset_in_buf + size_length;
let end = match next {
Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
None => start_raw + blob_size,
None => start + blob_size,
};
assert_eq!(end_raw - start_raw, blob_size);
let (start, end);
if compression_bits == BYTE_UNCOMPRESSED {
start = start_raw as usize;
end = end_raw as usize;
} else if compression_bits == BYTE_ZSTD {
let mut decoder =
async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec);
decoder
.write_all(&buf[start_raw as usize..end_raw as usize])
.await?;
decoder.flush().await?;
start = buf.len();
buf.extend_from_slice(&decompressed_vec);
end = buf.len();
decompressed_vec.clear();
} else {
let error = std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("invalid compression byte {compression_bits:x}"),
);
return Err(error);
}
assert_eq!(end - start, blob_size);
metas.push(VectoredBlob {
start,
end,
start: start as usize,
end: end as usize,
meta: *meta,
});
})
}
Ok(VectoredBlobsBuf { buf, blobs: metas })
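The 1-or-4-byte length header described in the comments above can be decoded in isolation; this is a hedged sketch over a raw byte slice that ignores the compression bits handled on the other side of this diff.
/// Decode a blob length header at `offset` in `buf`.
/// Returns (header_len, blob_len): a first byte below 0x80 is the whole
/// length; otherwise the length is 4 bytes big-endian with the top bit
/// of the first byte masked off.
fn decode_len_header(buf: &[u8], offset: usize) -> (usize, u64) {
    let first = buf[offset];
    if first < 0x80 {
        (1, first as u64)
    } else {
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&buf[offset..offset + 4]);
        len_buf[0] &= 0x7f;
        (4, u32::from_be_bytes(len_buf) as u64)
    }
}
fn main() {
    // Short blob: single-byte header.
    assert_eq!(decode_len_header(&[0x05, b'h', b'e', b'l', b'l', b'o'], 0), (1, 5));
    // Long blob: 4-byte header, here 0x0001_0000 bytes with the MSB set on the wire.
    assert_eq!(decode_len_header(&[0x80, 0x01, 0x00, 0x00], 0), (4, 0x0001_0000));
}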
@@ -394,18 +376,17 @@ impl<'a> VectoredBlobReader<'a> {
}
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
/// max_cnt constraints.
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would exceed the read_size and
/// max_cnt constraints. Under the hood, it uses [`VectoredReadPlanner`].
#[cfg(test)]
pub struct StreamingVectoredReadPlanner {
read_builder: Option<VectoredReadBuilder>,
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
prev: Option<(Key, Lsn, u64)>,
/// Max read size per batch. This is not a strict limit: if there are reads [0, 100) and [100, 200) and `max_read_size` is 150,
/// we will produce a single batch instead of splitting them.
planner: VectoredReadPlanner,
/// Max read size per batch
max_read_size: u64,
/// Max item count per batch
max_cnt: usize,
/// The first offset of this batch
this_batch_first_offset: Option<u64>,
/// Size of the current batch
cnt: usize,
}
@@ -416,100 +397,67 @@ impl StreamingVectoredReadPlanner {
assert!(max_cnt > 0);
assert!(max_read_size > 0);
Self {
read_builder: None,
prev: None,
// We want to have exactly one read syscall (plus several others for index lookup) for each `next_batch` call.
// Therefore, we enforce `self.max_read_size` by ourselves instead of using the VectoredReadPlanner's capability,
// to avoid splitting into two I/Os.
planner: VectoredReadPlanner::new_caller_controlled_max_limit(),
max_cnt,
max_read_size,
this_batch_first_offset: None,
cnt: 0,
}
}
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option<VectoredRead> {
// Implementation note: internally lag behind by one blob such that
// we have a start and end offset when initialising [`VectoredRead`]
let (prev_key, prev_lsn, prev_offset) = match self.prev {
None => {
self.prev = Some((key, lsn, offset));
return None;
}
Some(prev) => prev,
};
let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false);
self.prev = Some((key, lsn, offset));
res
fn emit(&mut self, this_batch_first_offset: u64) -> VectoredRead {
let planner = std::mem::replace(
&mut self.planner,
VectoredReadPlanner::new_caller_controlled_max_limit(),
);
self.this_batch_first_offset = Some(this_batch_first_offset);
self.cnt = 1;
let mut batch = planner.finish();
assert_eq!(batch.len(), 1, "should have exactly one read batch");
batch.pop().unwrap()
}
pub fn handle_range_end(&mut self, offset: u64) -> Option<VectoredRead> {
let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev {
self.add_blob(prev_key, prev_lsn, prev_offset, offset, true)
} else {
None
};
self.prev = None;
res
}
fn add_blob(
pub fn handle(
&mut self,
key: Key,
lsn: Lsn,
start_offset: u64,
end_offset: u64,
is_last_blob_in_read: bool,
offset: u64,
flag: BlobFlag,
) -> Option<VectoredRead> {
match &mut self.read_builder {
Some(read_builder) => {
let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
assert_eq!(extended, VectoredReadExtended::Yes);
}
None => {
self.read_builder = {
let mut blobs_at = VecMap::default();
blobs_at
.append(start_offset, BlobMeta { key, lsn })
.expect("First insertion always succeeds");
Some(VectoredReadBuilder {
start: start_offset,
end: end_offset,
blobs_at,
max_read_size: None,
})
};
if let Some(begin_offset) = self.this_batch_first_offset {
// Each batch will have at least one item b/c `self.this_batch_first_offset` is set
// after one item gets processed
if offset - begin_offset > self.max_read_size {
self.planner.handle_range_end(offset); // End the current batch with the offset
let batch = self.emit(offset); // Produce a batch
self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
return Some(batch);
}
} else {
self.this_batch_first_offset = Some(offset)
}
let read_builder = self.read_builder.as_mut().unwrap();
if self.cnt >= self.max_cnt {
self.planner.handle_range_end(offset); // End the current batch with the offset
let batch = self.emit(offset); // Produce a batch
self.planner.handle(key, lsn, offset, flag); // Add this key to the next batch
return Some(batch);
}
self.planner.handle(key, lsn, offset, flag); // Add this key to the current batch
self.cnt += 1;
if is_last_blob_in_read
|| read_builder.size() >= self.max_read_size as usize
|| self.cnt >= self.max_cnt
{
let prev_read_builder = self.read_builder.take();
self.cnt = 0;
// `current_read_builder` is None in the first iteration
if let Some(read_builder) = prev_read_builder {
return Some(read_builder.build());
}
}
None
}
pub fn handle_range_end(&mut self, offset: u64) -> VectoredRead {
self.planner.handle_range_end(offset);
self.emit(offset)
}
}
#[cfg(test)]
mod tests {
use anyhow::Error;
use crate::context::DownloadBehavior;
use crate::page_cache::PAGE_SZ;
use crate::task_mgr::TaskKind;
use super::super::blob_io::tests::{random_array, write_maybe_compressed};
use super::*;
fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
@@ -561,11 +509,8 @@ mod tests {
planner.handle_range_end(652 * 1024);
let reads = planner.finish();
assert_eq!(reads.len(), 6);
// TODO: could remove zero reads to produce 5 reads here
for (idx, read) in reads.iter().enumerate() {
validate_read(read, ranges[idx]);
}
@@ -603,187 +548,4 @@ mod tests {
validate_read(read, ranges[idx]);
}
}
#[test]
fn streaming_planner_max_read_size_test() {
let max_read_size = 128 * 1024;
let key = Key::MIN;
let lsn = Lsn(0);
let blob_descriptions = vec![
(key, lsn, 0, BlobFlag::None),
(key, lsn, 32 * 1024, BlobFlag::None),
(key, lsn, 96 * 1024, BlobFlag::None),
(key, lsn, 128 * 1024, BlobFlag::None),
(key, lsn, 198 * 1024, BlobFlag::None),
(key, lsn, 268 * 1024, BlobFlag::None),
(key, lsn, 396 * 1024, BlobFlag::None),
(key, lsn, 652 * 1024, BlobFlag::None),
];
let ranges = [
&blob_descriptions[0..3],
&blob_descriptions[3..5],
&blob_descriptions[5..6],
&blob_descriptions[6..7],
&blob_descriptions[7..],
];
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000);
let mut reads = Vec::new();
for (key, lsn, offset, _) in blob_descriptions.clone() {
reads.extend(planner.handle(key, lsn, offset));
}
reads.extend(planner.handle_range_end(652 * 1024));
assert_eq!(reads.len(), ranges.len());
for (idx, read) in reads.iter().enumerate() {
validate_read(read, ranges[idx]);
}
}
#[test]
fn streaming_planner_max_cnt_test() {
let max_read_size = 1024 * 1024;
let key = Key::MIN;
let lsn = Lsn(0);
let blob_descriptions = vec![
(key, lsn, 0, BlobFlag::None),
(key, lsn, 32 * 1024, BlobFlag::None),
(key, lsn, 96 * 1024, BlobFlag::None),
(key, lsn, 128 * 1024, BlobFlag::None),
(key, lsn, 198 * 1024, BlobFlag::None),
(key, lsn, 268 * 1024, BlobFlag::None),
(key, lsn, 396 * 1024, BlobFlag::None),
(key, lsn, 652 * 1024, BlobFlag::None),
];
let ranges = [
&blob_descriptions[0..2],
&blob_descriptions[2..4],
&blob_descriptions[4..6],
&blob_descriptions[6..],
];
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
let mut reads = Vec::new();
for (key, lsn, offset, _) in blob_descriptions.clone() {
reads.extend(planner.handle(key, lsn, offset));
}
reads.extend(planner.handle_range_end(652 * 1024));
assert_eq!(reads.len(), ranges.len());
for (idx, read) in reads.iter().enumerate() {
validate_read(read, ranges[idx]);
}
}
#[test]
fn streaming_planner_edge_test() {
let max_read_size = 1024 * 1024;
let key = Key::MIN;
let lsn = Lsn(0);
{
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
let mut reads = Vec::new();
reads.extend(planner.handle_range_end(652 * 1024));
assert!(reads.is_empty());
}
{
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
let mut reads = Vec::new();
reads.extend(planner.handle(key, lsn, 0));
reads.extend(planner.handle_range_end(652 * 1024));
assert_eq!(reads.len(), 1);
validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
}
{
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1);
let mut reads = Vec::new();
reads.extend(planner.handle(key, lsn, 0));
reads.extend(planner.handle(key, lsn, 128 * 1024));
reads.extend(planner.handle_range_end(652 * 1024));
assert_eq!(reads.len(), 2);
validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]);
validate_read(&reads[1], &[(key, lsn, 128 * 1024, BlobFlag::None)]);
}
{
let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2);
let mut reads = Vec::new();
reads.extend(planner.handle(key, lsn, 0));
reads.extend(planner.handle(key, lsn, 128 * 1024));
reads.extend(planner.handle_range_end(652 * 1024));
assert_eq!(reads.len(), 1);
validate_read(
&reads[0],
&[
(key, lsn, 0, BlobFlag::None),
(key, lsn, 128 * 1024, BlobFlag::None),
],
);
}
}
async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let (_temp_dir, pathbuf, offsets) =
write_maybe_compressed::<true>(blobs, compression, &ctx).await?;
let file = VirtualFile::open(&pathbuf, &ctx).await?;
let file_len = std::fs::metadata(&pathbuf)?.len();
// Multiply by two (compressed data might need more space), and add a few bytes for the header
let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
let mut buf = BytesMut::with_capacity(reserved_bytes);
let vectored_blob_reader = VectoredBlobReader::new(&file);
let meta = BlobMeta {
key: Key::MIN,
lsn: Lsn(0),
};
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
let end = offsets.get(idx + 1).unwrap_or(&file_len);
if idx + 1 == offsets.len() {
continue;
}
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
let read = read_builder.build();
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
assert_eq!(result.blobs.len(), 1);
let read_blob = &result.blobs[0];
let read_buf = &result.buf[read_blob.start..read_blob.end];
assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}");
buf = result.buf;
}
Ok(())
}
#[tokio::test]
async fn test_really_big_array() -> Result<(), Error> {
let blobs = &[
b"test".to_vec(),
random_array(10 * PAGE_SZ),
b"hello".to_vec(),
random_array(66 * PAGE_SZ),
vec![0xf3; 24 * PAGE_SZ],
b"foobar".to_vec(),
];
round_trip_test_compressed(blobs, false).await?;
round_trip_test_compressed(blobs, true).await?;
Ok(())
}
#[tokio::test]
async fn test_arrays_inc() -> Result<(), Error> {
let blobs = (0..PAGE_SZ / 8)
.map(|v| random_array(v * 16))
.collect::<Vec<_>>();
round_trip_test_compressed(&blobs, false).await?;
round_trip_test_compressed(&blobs, true).await?;
Ok(())
}
}
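A standalone, hedged sketch of the batching rule that StreamingVectoredReadPlanner implements above: offsets are grouped into byte ranges, and a batch is cut when it would span more than max_read_size bytes or already holds max_cnt blobs. The types are simplified and the helper below is not part of the real code.
/// Group ascending blob offsets into batches of (start, end) byte ranges,
/// cutting a batch when it would span more than `max_read_size` bytes or
/// already contains `max_cnt` blobs. `range_end` closes the last batch.
fn plan_batches(offsets: &[u64], range_end: u64, max_read_size: u64, max_cnt: usize) -> Vec<(u64, u64)> {
    let mut batches = Vec::new();
    let mut batch_start: Option<u64> = None;
    let mut cnt = 0;
    for &off in offsets {
        match batch_start {
            None => {
                batch_start = Some(off);
                cnt = 1;
            }
            Some(start) if off - start > max_read_size || cnt >= max_cnt => {
                batches.push((start, off)); // close the batch at this blob's start
                batch_start = Some(off);
                cnt = 1;
            }
            Some(_) => cnt += 1,
        }
    }
    if let Some(start) = batch_start {
        batches.push((start, range_end));
    }
    batches
}
fn main() {
    // Mirrors the spirit of the removed `streaming_planner_max_cnt_test`:
    // with max_cnt = 2 every batch holds at most two blobs.
    let offsets = [0, 32 * 1024, 96 * 1024, 128 * 1024];
    let batches = plan_batches(&offsets, 256 * 1024, 1024 * 1024, 2);
    assert_eq!(batches, vec![(0, 96 * 1024), (96 * 1024, 256 * 1024)]);
}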

pageserver/src/trace.rs (new file)
View File

@@ -0,0 +1,36 @@
use bytes::Bytes;
use camino::Utf8PathBuf;
use std::{
fs::{create_dir_all, File},
io::{BufWriter, Write},
};
pub struct Tracer {
writer: BufWriter<File>,
}
impl Drop for Tracer {
fn drop(&mut self) {
self.flush()
}
}
impl Tracer {
pub fn new(path: Utf8PathBuf) -> Self {
let parent = path.parent().expect("failed to parse parent path");
create_dir_all(parent).expect("failed to create trace dir");
let file = File::create(path).expect("failed to create trace file");
Tracer {
writer: BufWriter::new(file),
}
}
pub fn trace(&mut self, msg: &Bytes) {
self.writer.write_all(msg).expect("failed to write trace");
}
pub fn flush(&mut self) {
self.writer.flush().expect("failed to flush trace file");
}
}
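A small hedged usage sketch for the Tracer added above; the path is made up, and flushing also happens implicitly on drop.
use bytes::Bytes;
use camino::Utf8PathBuf;
fn main() {
    // Create the trace file (and its parent directory), write one message, flush.
    let mut tracer = Tracer::new(Utf8PathBuf::from("/tmp/neon-trace/example.bin"));
    tracer.trace(&Bytes::from_static(b"first traced message"));
    tracer.flush(); // optional; the Drop impl flushes as well
}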

View File

@@ -33,7 +33,6 @@ pub struct BufferedWriter<B, W> {
/// invariant: always remains Some(buf) except
/// - while IO is ongoing => goes back to Some() once the IO completed successfully
/// - after an IO error => stays `None` forever
///
/// In these exceptional cases, it's `None`.
buf: Option<B>,
}

View File

@@ -1,54 +0,0 @@
commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb
Author: Anastasia Lubennikova <anastasia@neon.tech>
Date: Mon Jul 15 12:31:56 2024 +0100
Neon: fix unlogged index build patch
diff --git a/src/ruminsert.c b/src/ruminsert.c
index e8b209d..e89bf2a 100644
--- a/src/ruminsert.c
+++ b/src/ruminsert.c
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(index->rd_smgr);
+#endif
+
initRumState(&buildstate.rumstate, index);
buildstate.rumstate.isBuild = true;
buildstate.indtuples = 0;
@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+#endif
+
/*
* Write index to xlog
*/
@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
UnlockReleaseBuffer(buffer);
}
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+
+ smgr_end_unlogged_build(index->rd_smgr);
+ }
+#endif
+
/*
* Return statistics
*/

View File

@@ -46,21 +46,6 @@ void _PG_init(void);
static int logical_replication_max_snap_files = 300;
static int running_xacts_overflow_policy;
enum RunningXactsOverflowPolicies {
OP_IGNORE,
OP_SKIP,
OP_WAIT
};
static const struct config_enum_entry running_xacts_overflow_policies[] = {
{"ignore", OP_IGNORE, false},
{"skip", OP_SKIP, false},
{"wait", OP_WAIT, false},
{NULL, 0, false}
};
static void
InitLogicalReplicationMonitor(void)
{
@@ -429,7 +414,6 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n
restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId));
n_restored_xids = 0;
next_prepared_idx = 0;
for (TransactionId xid = from; xid != till;)
{
XLogRecPtr xidlsn;
@@ -440,7 +424,7 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n
/*
* "Merge" the prepared transactions into the restored_xids array as
* we go. The prepared transactions array is sorted. This is mostly
* a sanity check to ensure that all the prepared transactions are
* a sanity check to ensure that all the prepared transactions are
* seen as in-progress. (There is a check after the loop that we didn't
* miss any.)
*/
@@ -538,23 +522,14 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n
elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
checkpoint->oldestXid, checkpoint->oldestActiveXid,
XidFromFullTransactionId(checkpoint->nextXid));
switch (running_xacts_overflow_policy)
{
case OP_WAIT:
goto fail;
case OP_IGNORE:
goto success;
case OP_SKIP:
n_restored_xids = 0;
goto success;
}
goto fail;
}
restored_xids[n_restored_xids++] = xid;
skip:
TransactionIdAdvance(xid);
continue;
}
/* sanity check */
@@ -565,13 +540,11 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n
Assert(false);
goto fail;
}
success:
elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u",
n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid));
*nxids = n_restored_xids;
*xids = restored_xids;
if (prepared_xids)
pfree(prepared_xids);
return true;
fail:
@@ -608,18 +581,6 @@ _PG_init(void)
restore_running_xacts_callback = RestoreRunningXactsFromClog;
DefineCustomEnumVariable(
"neon.running_xacts_overflow_policy",
"Action performed on snapshot overflow when restoring runnings xacts from CLOG",
NULL,
&running_xacts_overflow_policy,
OP_IGNORE,
running_xacts_overflow_policies,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
/*
* Important: This must happen after other parts of the extension are
* loaded, otherwise any settings to GUCs that were set before the

View File

@@ -109,12 +109,11 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_
{
NeonWALReader *reader;
/*
* Note: we allocate in TopMemoryContext, reusing the reader for all process
* reads.
*/
reader = (NeonWALReader *)
MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader));
palloc_extended(sizeof(NeonWALReader),
MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
if (!reader)
return NULL;
reader->available_lsn = available_lsn;
reader->seg.ws_file = -1;

poetry.lock (generated)
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -734,13 +734,13 @@ typing-extensions = ">=4.1.0"
[[package]]
name = "certifi"
version = "2024.7.4"
version = "2023.7.22"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"},
{file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"},
{file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
]
[[package]]
@@ -2641,18 +2641,19 @@ pbr = "*"
[[package]]
name = "setuptools"
version = "70.0.0"
version = "65.5.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
{file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
{file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
{file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"
@@ -3132,18 +3133,18 @@ multidict = ">=4.0"
[[package]]
name = "zipp"
version = "3.19.1"
version = "3.8.1"
description = "Backport of pathlib-compatible object wrapper for zip files"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"},
{file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"},
{file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"},
{file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"},
]
[package.extras]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"]
testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
[[package]]
name = "zstandard"

View File

@@ -92,7 +92,6 @@ tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
typed-json.workspace = true
url.workspace = true
urlencoding.workspace = true
utils.workspace = true

View File

@@ -216,11 +216,10 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
use pq_proto::FeStartupPacket::*;
match msg {
SslRequest { direct: false } => {
SslRequest => {
stream
.write_message(&pq_proto::BeMessage::EncryptionResponse(true))
.await?;
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.

View File

@@ -319,7 +319,7 @@ impl ConnCfg {
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause);
tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
let stream = connection.stream.into_inner();
info!(

View File

@@ -75,9 +75,6 @@ impl TlsConfig {
}
}
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L159>
pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql";
/// Configure TLS for the main endpoint.
pub fn configure_tls(
key_path: &str,
@@ -114,17 +111,16 @@ pub fn configure_tls(
let cert_resolver = Arc::new(cert_resolver);
// allow TLS 1.2 to be compatible with older client libraries
let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[
let config = rustls::ServerConfig::builder_with_protocol_versions(&[
&rustls::version::TLS13,
&rustls::version::TLS12,
])
.with_no_client_auth()
.with_cert_resolver(cert_resolver.clone());
config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()];
.with_cert_resolver(cert_resolver.clone())
.into();
Ok(TlsConfig {
config: Arc::new(config),
config,
common_names,
cert_resolver,
})
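For orientation, a minimal, hedged sketch of the rustls builder pattern this hunk reshapes: a server config that accepts TLS 1.3 and 1.2 and advertises the "postgresql" ALPN identifier, which is what the PG_ALPN_PROTOCOL lines in the hunk deal with. It uses with_single_cert instead of the project's custom certificate resolver, and the certificate/key loading is assumed to happen elsewhere.

use std::sync::Arc;
use rustls::pki_types::{CertificateDer, PrivateKeyDer};

// Sketch only: the certificate chain and private key are assumed to be
// loaded by the caller; the project itself uses a cert resolver instead.
fn build_server_config(
    certs: Vec<CertificateDer<'static>>,
    key: PrivateKeyDer<'static>,
) -> anyhow::Result<Arc<rustls::ServerConfig>> {
    // Keep TLS 1.2 enabled for compatibility with older client libraries.
    let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[
        &rustls::version::TLS13,
        &rustls::version::TLS12,
    ])
    .with_no_client_auth()
    .with_single_cert(certs, key)?;

    // ALPN value Postgres clients offer when negotiating TLS directly.
    config.alpn_protocols = vec![b"postgresql".to_vec()];

    Ok(Arc::new(config))
}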

View File

@@ -6,9 +6,8 @@ use anyhow::Context;
use once_cell::sync::Lazy;
use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
use std::convert::Infallible;
use std::{convert::Infallible, future};
use tokio::net::{TcpListener, TcpStream};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, Instrument};
static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
@@ -68,9 +67,7 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
pgbackend
.run(&mut MgmtHandler, &CancellationToken::new())
.await
pgbackend.run(&mut MgmtHandler, future::pending::<()>).await
}
/// A message received by `mgmt` when a compute node is ready.
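The second argument being swapped in the hunk above is a cancellation signal: a CancellationToken that nobody ever cancels behaves just like future::pending(). For readers unfamiliar with the pattern, a small self-contained sketch (the function names are placeholders, not the project's API):

use tokio_util::sync::CancellationToken;

// Run the worker until it finishes on its own or the token is cancelled.
async fn serve_until_cancelled(cancel: CancellationToken) {
    tokio::select! {
        _ = cancel.cancelled() => {
            // Shutdown requested by whoever holds a clone of the token.
        }
        _ = handle_traffic() => {
            // Worker completed normally.
        }
    }
}

async fn handle_traffic() {
    // Placeholder for real connection handling.
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let task = tokio::spawn(serve_until_cancelled(cancel.clone()));
    // Later, e.g. on SIGTERM:
    cancel.cancel();
    task.await.unwrap();
}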

View File

@@ -4,11 +4,14 @@
pub mod health_server;
use std::time::Duration;
use std::{str::FromStr, sync::Arc, time::Duration};
use futures::FutureExt;
pub use reqwest::{Request, Response, StatusCode};
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::Instant;
use tracing::trace;
use crate::{
metrics::{ConsoleRequest, Metrics},
@@ -21,6 +24,8 @@ use reqwest_middleware::RequestBuilder;
/// We deliberately don't want to replace this with a public static.
pub fn new_client() -> ClientWithMiddleware {
let client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.build()
.expect("Failed to create http client");
@@ -31,6 +36,8 @@ pub fn new_client() -> ClientWithMiddleware {
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
let timeout_client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.timeout(default_timout)
.build()
.expect("Failed to create http client with timeout");
@@ -96,6 +103,38 @@ impl Endpoint {
}
}
use hyper_util::client::legacy::connect::dns::{
GaiResolver as HyperGaiResolver, Name as HyperName,
};
use reqwest::dns::{Addrs, Name, Resolve, Resolving};
/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
use tower_service::Service;
#[derive(Debug)]
pub struct GaiResolver(HyperGaiResolver);
impl Default for GaiResolver {
fn default() -> Self {
Self(HyperGaiResolver::new())
}
}
impl Resolve for GaiResolver {
fn resolve(&self, name: Name) -> Resolving {
let this = &mut self.0.clone();
let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid");
let start = Instant::now();
Box::pin(
Service::<HyperName>::call(this, hyper_name).map(move |result| {
let resolve_duration = start.elapsed();
trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete");
result
.map(|addrs| -> Addrs { Box::new(addrs) })
.map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
}),
)
}
}
#[cfg(test)]
mod tests {
use super::*;
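The ExponentialBackoff and RetryTransientMiddleware re-exported earlier in this file are typically wired into a ClientWithMiddleware as below; this is a generic reqwest-middleware sketch, not the project's exact construction (the retry count and the base client are arbitrary).

use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};

fn client_with_retries() -> ClientWithMiddleware {
    // Retry transient failures (timeouts, 5xx) with exponential backoff.
    let retry_policy = ExponentialBackoff::builder().build_with_max_retries(3);
    ClientBuilder::new(reqwest::Client::new())
        .with(RetryTransientMiddleware::new_with_policy(retry_policy))
        .build()
}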

View File

@@ -3,8 +3,8 @@ use std::marker::PhantomData;
use measured::{
label::NoLabels,
metric::{
gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding,
MetricFamilyEncoding, MetricType,
gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder,
MetricEncoding, MetricFamilyEncoding, MetricType,
},
text::TextEncoder,
LabelGroup, MetricGroup,
@@ -100,7 +100,7 @@ macro_rules! jemalloc_gauge {
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
if let Ok(v) = mib.read() {
GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?;
enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?;
}
Ok(())
}

View File

@@ -15,8 +15,7 @@ use tracing_subscriber::{
pub async fn init() -> anyhow::Result<LoggingGuard> {
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::INFO.into())
.from_env_lossy()
.add_directive("azure_core::policies::transport=off".parse().unwrap());
.from_env_lossy();
let fmt_layer = tracing_subscriber::fmt::layer()
.with_ansi(false)

View File

@@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock};
use lasso::ThreadedRodeo;
use measured::{
label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet},
label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
metric::{histogram::Thresholds, name::MetricName},
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
LabelGroup, MetricGroup,
@@ -577,32 +577,6 @@ impl LabelGroup for ThreadPoolWorkerId {
}
}
impl LabelGroupSet for ThreadPoolWorkers {
type Group<'a> = ThreadPoolWorkerId;
fn cardinality(&self) -> Option<usize> {
Some(self.0)
}
fn encode_dense(&self, value: Self::Unique) -> Option<usize> {
Some(value)
}
fn decode_dense(&self, value: usize) -> Self::Group<'_> {
ThreadPoolWorkerId(value)
}
type Unique = usize;
fn encode(&self, value: Self::Group<'_>) -> Option<Self::Unique> {
Some(value.0)
}
fn decode(&self, value: &Self::Unique) -> Self::Group<'_> {
ThreadPoolWorkerId(*value)
}
}
impl LabelSet for ThreadPoolWorkers {
type Value<'a> = ThreadPoolWorkerId;

View File

@@ -1,17 +1,11 @@
use bytes::Buf;
use pq_proto::{
framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion,
StartupMessageParams,
};
use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
use tracing::info;
use crate::{
auth::endpoint_sni,
config::{TlsConfig, PG_ALPN_PROTOCOL},
config::TlsConfig,
error::ReportableError,
metrics::Metrics,
proxy::ERR_INSECURE_CONNECTION,
stream::{PqStream, Stream, StreamUpgradeError},
};
@@ -74,9 +68,6 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0);
const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0);
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
@@ -84,96 +75,40 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
use FeStartupPacket::*;
match msg {
SslRequest { direct } => match stream.get_ref() {
SslRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let have_tls = tls.is_some();
if !direct {
stream
.write_message(&Be::EncryptionResponse(have_tls))
.await?;
} else if !have_tls {
return Err(HandshakeError::ProtocolViolation);
}
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
let Framed {
stream: raw,
read_buf,
write_buf,
} = stream.framed;
let Stream::Raw { raw } = raw else {
return Err(HandshakeError::StreamUpgradeError(
StreamUpgradeError::AlreadyTls,
));
};
let mut read_buf = read_buf.reader();
let mut res = Ok(());
let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config())
.accept_with(raw, |session| {
// push the early data to the tls session
while !read_buf.get_ref().is_empty() {
match session.read_tls(&mut read_buf) {
Ok(_) => {}
Err(e) => {
res = Err(e);
break;
}
}
}
});
res?;
let read_buf = read_buf.into_inner();
let (raw, read_buf) = stream.into_inner();
// TODO: Normally, client doesn't send any data before
// server says TLS handshake is ok and read_buf is empty.
// However, you could imagine pipelining of postgres
// SSLRequest + TLS ClientHello in one hunk similar to
// pipelining in our node js driver. We should probably
// support that by chaining read_buf with the stream.
if !read_buf.is_empty() {
return Err(HandshakeError::EarlyData);
}
let tls_stream = accept.await.inspect_err(|_| {
if record_handshake_error {
Metrics::get().proxy.tls_handshake_failures.inc()
}
})?;
let conn_info = tls_stream.get_ref().1;
// check the ALPN, if exists, as required.
match conn_info.alpn_protocol() {
None | Some(PG_ALPN_PROTOCOL) => {}
Some(other) => {
// try parse ep for better error
let ep = conn_info.server_name().and_then(|sni| {
endpoint_sni(sni, &tls.common_names).ok().flatten()
});
let alpn = String::from_utf8_lossy(other);
warn!(?ep, %alpn, "unexpected ALPN");
return Err(HandshakeError::ProtocolViolation);
}
}
let tls_stream = raw
.upgrade(tls.to_server_config(), record_handshake_error)
.await?;
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(conn_info.server_name())
.resolve(tls_stream.get_ref().1.server_name())
.ok_or(HandshakeError::MissingCertificate)?;
stream = PqStream {
framed: Framed {
stream: Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
},
read_buf,
write_buf,
},
};
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
}
}
_ => return Err(HandshakeError::ProtocolViolation),
@@ -187,9 +122,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
_ => return Err(HandshakeError::ProtocolViolation),
},
StartupMessage { params, version }
if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
{
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
@@ -198,48 +131,9 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
.await?;
}
info!(?version, session_type = "normal", "successful handshake");
info!(session_type = "normal", "successful handshake");
break Ok(HandshakeData::Startup(stream, params));
}
// downgrade protocol version
StartupMessage { params, version }
if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
{
warn!(?version, "unsupported minor version");
// no protocol extensions are supported.
// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/backend/tcop/backend_startup.c#L744-L753>
let mut unsupported = vec![];
for (k, _) in params.iter() {
if k.starts_with("_pq_.") {
unsupported.push(k);
}
}
// TODO: remove unsupported options so we don't send them to compute.
stream
.write_message(&Be::NegotiateProtocolVersion {
version: PG_PROTOCOL_LATEST,
options: &unsupported,
})
.await?;
info!(
?version,
session_type = "normal",
"successful handshake; unsupported minor version requested"
);
break Ok(HandshakeData::Startup(stream, params));
}
StartupMessage { version, .. } => {
warn!(
?version,
session_type = "normal",
"unsuccessful handshake; unsupported version"
);
return Err(HandshakeError::ProtocolViolation);
}
CancelRequest(cancel_key_data) => {
info!(session_type = "cancellation", "successful handshake");
break Ok(HandshakeData::Cancel(cancel_key_data));

View File

@@ -106,7 +106,7 @@ impl RedisPublisherClient {
cancel_key_data,
session_id,
}))?;
let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
Ok(())
}
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
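The let _: () = annotation in the hunk above is the usual redis-rs idiom: command helpers are generic over the reply type, so even a discarded reply has to be pinned to a concrete type. A minimal sketch under assumed names (the connection type, channel, and payload are placeholders):

use redis::AsyncCommands;

async fn publish_notice(
    conn: &mut redis::aio::MultiplexedConnection,
    payload: String,
) -> redis::RedisResult<()> {
    // `publish` is generic over its return value; without the `()` annotation
    // the compiler cannot pick a `FromRedisValue` implementation.
    let _: () = conn.publish("example-channel", payload).await?;
    Ok(())
}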

Some files were not shown because too many files have changed in this diff.