Merge pull request #8451 from neondatabase/rc/2024-07-22

## Storage & Compute release 2024-07-22

This PR has so many commits because the release branch diverged from `main`.

Details https://neondb.slack.com/archives/C033A2WE6BZ/p1721650938949059?thread_ts=1721308848.034069&cid=C033A2WE6BZ

The commits that are truly new since the last storage release are the `main` commits that I cherry-picked using this command:

```
git cherry-pick 8a8b83df27383a07bb7dbba519325c15d2f46357..4e547e6
```
Christian Schwarz
2024-07-22 19:17:01 +02:00
committed by GitHub
115 changed files with 3864 additions and 1460 deletions

View File

@@ -9,8 +9,8 @@ inputs:
description: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 15'
default: '15'
description: 'Postgres version; default is 16'
default: '16'
api_host:
description: 'Neon API host'
default: console-stage.neon.build

View File

@@ -57,9 +57,10 @@ jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
strategy:
fail-fast: false
matrix:
include:
- DEFAULT_PG_VERSION: 14
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
provisioner: 'k8s-pod'
@@ -146,6 +147,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
replication-tests:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
@@ -190,6 +192,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -215,11 +218,14 @@ jobs:
# Available platforms:
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
runs-on: ubuntu-22.04
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -230,23 +236,33 @@ jobs:
- name: Generate matrix for pgbench benchmark
id: pgbench-compare-matrix
run: |
region_id_default=${{ env.DEFAULT_REGION_ID }}
matrix='{
"pg_version" : [
16
],
"region_id" : [
"'"$region_id_default"'"
],
"platform": [
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -298,7 +314,7 @@ jobs:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -323,14 +339,14 @@ jobs:
prefix: latest
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
region_id: ${{ matrix.region_id }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
@@ -343,7 +359,7 @@ jobs:
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -368,6 +384,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -381,6 +398,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -394,6 +412,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -420,6 +439,13 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
strategy:
fail-fast: false
matrix:
include:
- PLATFORM: "neon-captest-pgvector"
- PLATFORM: "azure-captest-pgvector"
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -427,8 +453,9 @@ jobs:
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-captest-pgvector"
PLATFORM: ${{ matrix.PLATFORM }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -438,17 +465,39 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
# until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
# instead of using Neon artifacts containing pgbench
- name: Install postgresql-16 where pytest expects it
run: |
cd /home/nonroot
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
/tmp/neon/pg_install/v16/bin/pgbench --version
/tmp/neon/pg_install/v16/bin/psql --version
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
case "${PLATFORM}" in
neon-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
;;
azure-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
@@ -460,6 +509,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -473,6 +523,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -487,11 +538,10 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClickBench DB for rds-aurora and rds-postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
@@ -735,6 +785,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

Cargo.lock generated
View File

@@ -1368,6 +1368,7 @@ dependencies = [
"tracing",
"url",
"utils",
"whoami",
"workspace_hack",
]
@@ -3233,16 +3234,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3538,12 +3529,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -4404,6 +4389,7 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"typed-json",
"url",
"urlencoding",
"utils",
@@ -4602,6 +4588,15 @@ dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "regex"
version = "1.10.2"
@@ -5811,6 +5806,28 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_controller_client"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"pageserver_api",
"pageserver_client",
"postgres",
"reqwest 0.12.4",
"serde",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"utils",
"workspace_hack",
]
[[package]]
name = "storage_scrubber"
version = "0.1.0"
@@ -5845,6 +5862,7 @@ dependencies = [
"serde",
"serde_json",
"serde_with",
"storage_controller_client",
"thiserror",
"tokio",
"tokio-postgres",
@@ -5874,6 +5892,7 @@ dependencies = [
"reqwest 0.12.4",
"serde",
"serde_json",
"storage_controller_client",
"thiserror",
"tokio",
"tracing",
@@ -6600,7 +6619,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -6665,6 +6683,16 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "typed-json"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -6961,6 +6989,12 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasite"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
[[package]]
name = "wasm-bindgen"
version = "0.2.92"
@@ -7113,6 +7147,17 @@ dependencies = [
"once_cell",
]
[[package]]
name = "whoami"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
dependencies = [
"redox_syscall 0.4.1",
"wasite",
"web-sys",
]
[[package]]
name = "winapi"
version = "0.3.9"

View File

@@ -13,6 +13,7 @@ members = [
"safekeeper",
"storage_broker",
"storage_controller",
"storage_controller/client",
"storage_scrubber",
"workspace_hack",
"libs/compute_api",
@@ -182,14 +183,16 @@ tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.7"
x509-parser = "0.15"
whoami = "1.5.1"
## TODO replace this with tracing
env_logger = "0.10"
@@ -219,6 +222,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
desim = { version = "0.1", path = "./libs/desim" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
storage_controller_client = { path = "./storage_controller/client" }
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }

View File

@@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
FROM build-deps AS rum-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/rum.patch /rum.patch
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control

View File

@@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> {
impl<'m> MigrationRunner<'m> {
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
// The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
assert!(migrations.len() + 1 < i64::MAX as usize);
Self { client, migrations }
}
@@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> {
Ok(row.get::<&str, i64>("id"))
}
fn update_migration_id(&mut self) -> Result<()> {
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
self.migrations.len()
);
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
self.client
.simple_query(&setval)
@@ -57,44 +57,49 @@ impl<'m> MigrationRunner<'m> {
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_migrations()?;
let mut current_migration: usize = self.get_migration_id()? as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
self.client
.simple_query(query)
.context("run_migrations begin")?;
let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}
let migration = self.migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
info!("Skipping migration id={}", migration_id!(current_migration));
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
migration_id!(current_migration),
migration
);
self.client
.simple_query("BEGIN")
.context("begin migration")?;
self.client.simple_query(migration).with_context(|| {
format!("run_migration current_migration={}", current_migration)
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;
// Migration IDs start at 1
self.update_migration_id(migration_id!(current_migration))?;
self.client
.simple_query("COMMIT")
.context("commit migration")?;
info!("Finished migration id={}", migration_id!(current_migration));
}
current_migration += 1;
}
self.update_migration_id()?;
let query = "COMMIT";
self.client
.simple_query(query)
.context("run_migrations commit")?;
info!(
"Ran {} migrations",
(self.migrations.len() - starting_migration_id)
);
Ok(())
}
}

View File

@@ -777,21 +777,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
// Add new migrations in numerical order.
let migrations = [
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0001-alter_roles.sql"),
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0002-alter_roles.sql"),
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!(
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
),
include_str!(
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
),
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
include_str!(
"./migrations/0009-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
),
];

View File

@@ -40,6 +40,7 @@ safekeeper_api.workspace = true
postgres_connection.workspace = true
storage_broker.workspace = true
utils.workspace = true
whoami.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true

View File

@@ -1,9 +1,9 @@
//! Code to manage the storage broker
//!
//! In the local test environment, the data for each safekeeper is stored in
//! In the local test environment, the storage broker stores its data directly in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! .neon
//! ```
use std::time::Duration;

View File

@@ -1,8 +1,10 @@
//! Code to manage pageservers
//!
//! In the local test environment, the pageserver stores its data directly in
//! In the local test environment, the data for each pageserver is stored in
//!
//! .neon/
//! ```text
//! .neon/pageserver_<pageserver_id>
//! ```
//!
use std::collections::HashMap;

View File

@@ -29,7 +29,6 @@ use utils::{
pub struct StorageController {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
private_key: Option<Vec<u8>>,
public_key: Option<String>,
postgres_port: u16,
@@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
const DB_NAME: &str = "storage_controller";
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -65,10 +66,6 @@ pub struct InspectResponse {
impl StorageController {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
.unwrap()
.join("attachments.json");
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
let listen_url = env.control_plane_api.clone().unwrap();
@@ -128,7 +125,6 @@ impl StorageController {
Self {
env: env.clone(),
path,
listen,
private_key,
public_key,
@@ -203,7 +199,6 @@ impl StorageController {
///
/// Returns the database url
pub async fn setup_database(&self) -> anyhow::Result<String> {
const DB_NAME: &str = "storage_controller";
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -232,6 +227,30 @@ impl StorageController {
Ok(database_url)
}
pub async fn connect_to_database(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
tokio_postgres::Config::new()
.host("localhost")
.port(self.postgres_port)
// The user is the ambient operating system user name.
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
//
// Until we get there, use the ambient operating system user name.
// Recent tokio-postgres versions default to this if the user isn't specified.
// But tokio-postgres fork doesn't have this upstream commit:
// https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
// => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
.user(&whoami::username())
.dbname(DB_NAME)
.connect(tokio_postgres::NoTls)
.await
.map_err(anyhow::Error::new)
}
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
@@ -256,18 +275,21 @@ impl StorageController {
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
};
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
//
// NB: it's important that we rewrite this file on each start command so we propagate changes
// from `LocalEnv`'s config file (`.neon/config`).
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
println!("Starting storage controller database...");
let db_start_args = [
"-w",
@@ -296,11 +318,38 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
// We support running a startup SQL script to fiddle with the database before we launch storcon.
// This is used by the test suite.
let startup_script_path = self
.env
.base_data_dir
.join("storage_controller_db.startup.sql");
let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
Ok(script) => {
tokio::fs::remove_file(startup_script_path).await?;
script
}
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
// always run some startup script so that this code path doesn't bit rot
"BEGIN; COMMIT;".to_string()
} else {
anyhow::bail!("Failed to read startup script: {e}")
}
}
};
let (mut client, conn) = self.connect_to_database().await?;
let conn = tokio::spawn(conn);
let tx = client.build_transaction();
let tx = tx.start().await?;
tx.batch_execute(&startup_script).await?;
tx.commit().await?;
drop(client);
conn.await??;
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--dev",
"--database-url",
&database_url,

View File

@@ -17,6 +17,7 @@ pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
storage_controller_client.workspace = true
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true

View File

@@ -14,15 +14,15 @@ use pageserver_api::{
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use pageserver_client::mgmt_api::{self};
use reqwest::{Method, StatusCode, Url};
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use storage_controller_client::control_api::Client;
#[derive(Subcommand, Debug)]
enum Command {
@@ -249,64 +249,6 @@ impl FromStr for NodeAvailabilityArg {
}
}
struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();

View File

@@ -0,0 +1,252 @@
# Ancestor Timeline Deletion
Created on: 2024-02-23
Author: John Spray
# Summary
When a tenant creates a new timeline that they will treat as their 'main' history,
it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
this is necessary because it is forbidden to delete a timeline which has descendants.
A new pageserver API is proposed to 'adopt' data from a parent timeline into
one of its children, such that the link between ancestor and child can be severed,
leaving the parent in a state where it may then be deleted.
# Motivation
Retaining parent timelines currently has two costs:
- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
child's timeline point, even if the child fully covers its keyspace with image
layers and will never actually read from the parent.
# Solution
A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
wish to detach from its parent.
On success, this API will leave the following state:
- The detached child timeline will no longer have an ancestor, and will contain all
the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
the detached child timeline will be modified to have the child timeline as their
new parent.
- The parent timeline will still exist, but the child will no longer have it as an
ancestor. If this was the last timeline that depended on the parent, then the
parent will become deletable.
This API's implementation will consist of a series of retryable steps, such that
on failures/timeout it can safely be called again to reach the target state.
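For illustration, here is a minimal sketch of a caller that drives this API to completion. The management endpoint (`localhost:9898`), the placeholder tenant/timeline ids, and the retry policy are assumptions made for this sketch, not part of the RFC:

```rust
// Hedged, illustrative caller: endpoint, port, ids and the retry policy are
// assumptions for this sketch, not prescribed by the RFC.
use std::time::Duration;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let tenant_shard_id = "3fa85f64b4704562b3fc2c963f66afa6"; // placeholder
    let timeline_id = "de201bd42c964b93a0562dfcb0c9a66d";     // placeholder ("new main")
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor"
    );

    let client = reqwest::Client::new();
    loop {
        // Every step of the operation is retryable, so on a timeout we simply
        // call again until the target state is reached.
        match client.put(url.as_str()).timeout(Duration::from_secs(60)).send().await {
            Ok(resp) if resp.status().is_success() => {
                // The response body lists the timelines that were re-parented to the child.
                println!("detached: {}", resp.text().await?);
                break;
            }
            Ok(resp) => anyhow::bail!("detach_ancestor failed: {}", resp.status()),
            Err(e) if e.is_timeout() => continue,
            Err(e) => return Err(e.into()),
        }
    }
    Ok(())
}
```

The same endpoint is also wrapped by the `timeline_detach_ancestor` method added to `pageserver_client::mgmt_api::Client` elsewhere in this PR.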
## Example
### Before
The user has "rolled back" their project to LSN X, resulting in a "new main"
timeline. The parent "old main" timeline still exists, and they would like
to clean it up.
They have two other timelines A and B. A is from before the rollback point,
and B is from after the rollback point.
```
----"old main" timeline-------X-------------------------------------------->
| | |
|-> child A | |
|-> "new main" timeline |
-> child B
```
### After calling detach ancestor API
The "new main" timeline is no longer dependent on old main, and neither
is child A, because it had a branch point before X.
The user may now choose to delete child B and "old main" to get to
a pristine state. Child B is likely to be unwanted since the user
chose to roll back to X, and it branches from after X. However, we
don't assume this in the API; it is up to the user to delete it.
```
|----"old main" timeline---------------------------------------------------->
|
|
|
-> child B
|----"new main" timeline--------->
|
|-> child A
```
### After removing timelines
We end up with a totally clean state that leaves no trace that a rollback
ever happened: there is only one root timeline.
```
| ----"new main" timeline----------->
|
|-> child A
```
## Caveats
Important things for API users to bear in mind:
- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
child, the parent won't be deletable: you must either delete or detach those
children.
- do _not_ simply loop over all children and detach them all: this can have an
extremely high storage cost. The detach ancestor API is intended for use on a single
timeline to make it the new "main".
- The detach ancestor API should also not be
exposed directly to the user as a button/API, because they might decide
to click it for all the children and thereby generate many copies of the
parent's data -- the detach ancestor API should be used as part
of a high level "clean up after rollback" feature.
## `detach_ancestor` API implementation
Terms used in the following sections:
- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point" the ancestor_lsn of "the child"
### Phase 1: write out adopted layers to S3
The child will "adopt" layers from the parent, such that its end state contains
all the parent's history as well as its own.
For all layers in the parent's layer map whose high LSN is below the branch
point, issue S3 CopyObject requests to duplicate them into the child timeline's
prefix. Do not add them to the child's layer map yet.
For delta layers in the parent's layer map which straddle the branch point, read them
and write out only content up to the branch point into new layer objects.
This is a long running operation if the parent has many layers: it should be
implemented in a way that resumes rather than restarting from scratch, if the API
times out and is called again.
As an optimization, if there are no other timelines that will be adopted into
the child, _and_ the child's image layers already fully cover the branch LSN,
then we may skip adopting layers.
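As a rough sketch of the copy step, assuming plain `aws-sdk-s3`, made-up bucket/prefix names, and a precomputed list of eligible layer names (the real implementation drives this through the pageserver's layer map and remote storage abstraction):

```rust
// Hedged sketch only: bucket and prefix names are made up, and the list of eligible
// layers is assumed to be precomputed from the parent's layer map.
use aws_sdk_s3::Client;

async fn adopt_layers_below_branch_point(
    s3: &Client,
    bucket: &str,
    parent_prefix: &str,                  // e.g. "tenants/<tenant>/timelines/<parent>"
    child_prefix: &str,                   // e.g. "tenants/<tenant>/timelines/<child>"
    layers_below_branch_point: &[String], // layer names whose high LSN is below the branch point
) -> Result<(), aws_sdk_s3::Error> {
    for layer in layers_below_branch_point {
        // Server-side copy: no layer bytes flow through the pageserver.
        s3.copy_object()
            .copy_source(format!("{bucket}/{parent_prefix}/{layer}"))
            .bucket(bucket)
            .key(format!("{child_prefix}/{layer}"))
            .send()
            .await?;
        // A resumable implementation would persist progress here so that a retried
        // API call can skip layers that were already copied.
    }
    Ok(())
}
```

Delta layers that straddle the branch point are not copied this way; they get the truncating rewrite described above.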
### Phase 2: update the child's index
Having written out all needed layers in phase 1, atomically link them all
into the child's IndexPart and upload to S3. This may be done while the
child Timeline is still running.
### Phase 3: modify timelines ancestry
Modify the child's ancestor to None, and upload its IndexPart to persist the change.
For all timelines which have the same parent as the child, and have a branch
point lower than our branch point, switch their ancestor_timeline to the child,
and upload their IndexPart to persist the change.
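The re-parenting rule can be summarized with a small sketch over a plain in-memory model; the struct and field names here are illustrative, not the pageserver's actual types:

```rust
// Illustrative model only: not the pageserver's real types, just the phase 3 rule
// written out over a plain in-memory structure.
#[derive(Clone, Copy)]
struct Lsn(u64);

struct TimelineMeta {
    id: u64,
    ancestor: Option<u64>,
    ancestor_lsn: Option<Lsn>,
}

/// `child` is the timeline being detached from `parent`; `branch_point` is its ancestor_lsn.
fn reparent(timelines: &mut [TimelineMeta], parent: u64, child: u64, branch_point: Lsn) {
    for tl in timelines.iter_mut() {
        if tl.id == child {
            // The detached child loses its ancestor entirely.
            tl.ancestor = None;
            tl.ancestor_lsn = None;
        } else if tl.ancestor == Some(parent)
            && tl.ancestor_lsn.map_or(false, |lsn| lsn.0 < branch_point.0)
        {
            // Siblings that branched below the branch point now descend from the child;
            // their branch LSNs stay unchanged, only the ancestor pointer moves.
            tl.ancestor = Some(child);
        }
    }
}
```

In the real system each of these pointer updates is followed by an IndexPart upload, as described above.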
## Alternatives considered
### Generate full image layer on child, rather than adopting parent deltas
This would work for the case of a single child, but would prevent re-targeting
other timelines that depended on the parent. If we detached many children this
way, the storage cost would become prohibitive (consider a 1TB database with
100 child timelines: it would cost 100TiB if they all generated their own image layers).
### Don't rewrite anything: just fake it in the API
We could add a layer of indirection that let a child "pretend" that it had no
ancestor, when in reality it still had the parent. The pageserver API could
accept deletion of ancestor timelines, and just update child metadata to make
them look like they have no ancestor.
This would not achieve the desired reduction in storage cost, and may well be more
complex to maintain than simply implementing the API described in this RFC.
### Avoid copying objects: enable child index to use parent layers directly
We could teach IndexPart to store a TimelineId for each layer, such that a child
timeline could reference a parent's layers directly, rather than copying them
into the child's prefix.
This would impose a cost for the normal case of indices that only target the
timeline's own layers, add complexity, and break the useful simplifying
invariant that timelines "own" their own path. If child timelines were
referencing layers from the parent, we would have to ensure that the parent
never runs GC/compaction again, which would make the API less flexible (the
proposal in this RFC enables deletion of the parent but doesn't require it.)
## Performance
### Adopting layers
- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
of such requests: this can take up to tens of seconds and will compete for RemoteStorage
semaphore units with other activity on the pageserver.
- If we are running on storage backend that doesn't implement CopyObject, then
this part will be much more expensive as we would stream all layer content
through the pageserver. This is no different to issuing a lot
of reads to a timeline that does not have a warm local cache: it will move
a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for delta that straddle the branch point will
require streaming read/write of all the layers in question.
### Updating timeline ancestry
The simplest way to update timeline ancestry will probably be to stop and start
all the Timeline objects: this is preferable to the complexity of making their
ancestry mutable at runtime.
There will be a corresponding "stutter" in the availability of the timelines,
of the order 10-100ms, which is the time taken to upload their IndexPart, and
restart the Timeline.
# Interaction with other features
## Concurrent timeline creation
If new historic timelines are created using the parent as an ancestor while the
detach ancestor API is running, they will not be re-parented to the child. This
doesn't break anything, but it leaves the parent in a state where it might not
be possible to delete it.
Since timeline creations are an explicit user action, this is not something we need to
worry about as the storage layer: a user who wants to delete their parent timeline will not create
new children, and if they do, they can choose to delete those children to
enable deleting the parent.
To minimize surprise to the user, the control plane should wait for all in-flight
branch creations to complete before starting the detach ancestor operation, and
it should not allow new branches to be created before the branch point on the
ancestor branch while the operation is ongoing.
## WAL based disaster recovery
WAL-based disaster recovery currently supports restoring only the main
branch. Enabling WAL-based disaster recovery in the future requires that we
keep a record of which timeline generated the WAL and at which LSN the parent
was detached. Keep a list of timeline ids and the LSNs at which they were
detached in the `index_part.json`. Limit the list to the first 100 entries,
after which WAL-based disaster recovery will no longer be possible.
## Sharded tenants
For sharded tenants, calls to the detach ancestor API will pass through the storage
controller, which will handle them the same as timeline creations: invoke first
on shard zero, and then on all the other shards.

View File

@@ -44,7 +44,7 @@ If you need to modify the database schema, here's how to create a migration:
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary and run automatically at startup after it is deployed, so once you've committed a migration no further steps are needed.

View File

@@ -87,7 +87,7 @@ pub struct TenantLocateResponse {
pub shard_params: ShardParameters,
}
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
@@ -110,7 +110,7 @@ pub struct NodeDescribeResponse {
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,

View File

@@ -651,6 +651,17 @@ pub struct TenantDetails {
pub timelines: Vec<TimelineId>,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
pub enum TimelineArchivalState {
Archived,
Unarchived,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {

View File

@@ -1,6 +1,6 @@
use utils::id::TimelineId;
#[derive(Default, serde::Serialize)]
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
}

View File

@@ -443,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
impl GenericRemoteStorage {
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
let timeout = storage_config.timeout;
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs { local_path: path } => {
@@ -458,7 +458,7 @@ impl GenericRemoteStorage {
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
}
RemoteStorageKind::AzureContainer(azure_config) => {
let storage_account = azure_config

View File

@@ -16,16 +16,10 @@ use std::{
use anyhow::{anyhow, Context as _};
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::CredentialsProviderChain,
profile::ProfileFileCredentialsProvider,
provider_config::ProviderConfig,
default_provider::credentials::DefaultCredentialsChain,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
BehaviorVersion,
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
@@ -76,40 +70,27 @@ struct GetObjectRequest {
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
tracing::debug!(
"Creating s3 remote storage for S3 bucket {}",
remote_storage_config.bucket_name
);
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
let region = Region::new(remote_storage_config.bucket_region.clone());
let region_opt = Some(region.clone());
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else(
"token",
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
// https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
// https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
// Incomplete list of auth methods used by this:
// * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
// * "AWS_PROFILE" / `aws sso login --profile <profile>`
// * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// * http (ECS/EKS) container credentials
// * imds v2
let credentials_provider = DefaultCredentialsChain::builder()
.region(region)
.build()
.await;
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
@@ -118,9 +99,9 @@ impl S3Bucket {
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
BehaviorVersion::v2023_11_09(),
)
.region(region)
.region(region_opt)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.credentials_provider(credentials_provider)
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
@@ -1041,8 +1022,8 @@ mod tests {
use crate::{RemotePath, S3Bucket, S3Config};
#[test]
fn relative_path() {
#[tokio::test]
async fn relative_path() {
let all_paths = ["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
@@ -1085,8 +1066,9 @@ mod tests {
max_keys_per_list_response: Some(5),
upload_storage_class: None,
};
let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
.await
.expect("remote storage init");
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
let result = storage.relative_path_to_s3_object(test_path);
let expected = expected_outputs[prefix_idx][test_path_idx];

View File

@@ -31,6 +31,7 @@ struct EnabledAzure {
impl EnabledAzure {
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
let client = create_azure_client(max_keys_in_list_response)
.await
.context("Azure client creation")
.expect("Azure client creation failed");
@@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
}
}
fn create_azure_client(
async fn create_azure_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
@@ -221,6 +222,8 @@ fn create_azure_client(
timeout: Duration::from_secs(120),
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?,
))
}

View File

@@ -197,6 +197,7 @@ struct EnabledS3 {
impl EnabledS3 {
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
let client = create_s3_client(max_keys_in_list_response)
.await
.context("S3 client creation")
.expect("S3 client creation failed");
@@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
}
}
fn create_s3_client(
async fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
@@ -385,7 +386,9 @@ fn create_s3_client(
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?,
))
}

View File

@@ -33,6 +33,10 @@ pub enum Scope {
GenerationsApi,
// Allows access to control plane managment API and some storage controller endpoints.
Admin,
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
/// of a tenant & post scrub results.
Scrubber,
}
/// JWT payload. See docs/authentication.md for the format

View File

@@ -1,6 +1,7 @@
use std::collections::HashMap;
use bytes::Bytes;
use detach_ancestor::AncestorDetached;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
@@ -418,6 +419,23 @@ impl Client {
}
}
pub async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<AncestorDetached> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
self.mgmt_api_endpoint
);
self.request(Method::PUT, &uri, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",

View File

@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?;
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
let cancel = CancellationToken::new();
storage
.unwrap()

View File

@@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
)),
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
))
}
}
}

View File

@@ -385,7 +385,7 @@ fn start_pageserver(
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -622,7 +622,6 @@ fn start_pageserver(
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
@@ -702,7 +701,7 @@ fn start_pageserver(
}
}
fn create_remote_storage_client(
async fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {
let config = if let Some(config) = &conf.remote_storage_config {
@@ -712,7 +711,7 @@ fn create_remote_storage_client(
};
// Create the client
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
let mut remote_storage = GenericRemoteStorage::from_config(config).await?;
// If `test_remote_failures` is non-zero, wrap the client with a
// wrapper that simulates failures.

View File

@@ -68,7 +68,6 @@ pub mod defaults {
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -123,7 +122,6 @@ pub mod defaults {
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
@@ -238,7 +236,6 @@ pub struct PageServerConf {
// How often to collect metrics and send them to the metrics endpoint.
pub metric_collection_interval: Duration,
// How often to send unchanged cached metrics to the metrics endpoint.
pub cached_metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub metric_collection_bucket: Option<RemoteStorageConfig>,
pub synthetic_size_calculation_interval: Duration,
@@ -370,7 +367,6 @@ struct PageServerConfigBuilder {
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
metric_collection_interval: BuilderValue<Duration>,
cached_metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
synthetic_size_calculation_interval: BuilderValue<Duration>,
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
@@ -454,10 +450,6 @@ impl PageServerConfigBuilder {
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default metric collection interval")),
cached_metric_collection_interval: Set(humantime::parse_duration(
DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default cached_metric_collection_interval")),
synthetic_size_calculation_interval: Set(humantime::parse_duration(
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
)
@@ -589,14 +581,6 @@ impl PageServerConfigBuilder {
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
}
pub fn cached_metric_collection_interval(
&mut self,
cached_metric_collection_interval: Duration,
) {
self.cached_metric_collection_interval =
BuilderValue::Set(cached_metric_collection_interval)
}
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
@@ -730,7 +714,6 @@ impl PageServerConfigBuilder {
broker_keepalive_interval,
log_format,
metric_collection_interval,
cached_metric_collection_interval,
metric_collection_endpoint,
metric_collection_bucket,
synthetic_size_calculation_interval,
@@ -947,7 +930,6 @@ impl PageServerConf {
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
}),
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
"metric_collection_endpoint" => {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
@@ -1080,7 +1062,6 @@ impl PageServerConf {
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
),
metric_collection_interval: Duration::from_secs(60),
cached_metric_collection_interval: Duration::from_secs(60 * 60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(60),
@@ -1259,7 +1240,6 @@ initial_superuser_name = 'zzzz'
id = 10
metric_collection_interval = '222 s'
cached_metric_collection_interval = '22200 s'
metric_collection_endpoint = 'http://localhost:80/metrics'
synthetic_size_calculation_interval = '333 s'
@@ -1315,9 +1295,6 @@ background_task_maximum_delay = '334 s'
metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
)?,
cached_metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: humantime::parse_duration(
@@ -1396,7 +1373,6 @@ background_task_maximum_delay = '334 s'
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(222),
cached_metric_collection_interval: Duration::from_secs(22200),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(333),

View File

@@ -46,19 +46,12 @@ pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_bucket: &Option<RemoteStorageConfig>,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
tracing::warn!(
"cached_metric_collection_interval is no longer used, please set it to zero."
)
}
// spin up background worker that calculates tenant sizes
let worker_ctx =
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
@@ -103,7 +96,7 @@ pub async fn collect_metrics(
.expect("Failed to create http client with timeout");
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
match GenericRemoteStorage::from_config(bucket_config) {
match GenericRemoteStorage::from_config(bucket_config).await {
Ok(client) => Some(client),
Err(e) => {
// Non-fatal error: if we were given an invalid config, we will proceed

View File

@@ -828,9 +828,9 @@ mod test {
}
}
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let harness = TenantHarness::create(test_name).await?;
// We do not load() the harness: we only need its config and remote_storage
@@ -844,7 +844,9 @@ mod test {
},
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let storage = GenericRemoteStorage::from_config(&storage_config)
.await
.unwrap();
let mock_control_plane = MockControlPlane::new();
@@ -922,7 +924,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_smoke() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
let ctx = setup("deletion_queue_smoke")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
@@ -992,7 +996,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_validation() -> anyhow::Result<()> {
let ctx = setup("deletion_queue_validation").expect("Failed test setup");
let ctx = setup("deletion_queue_validation")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
@@ -1051,7 +1057,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_recovery() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
let mut ctx = setup("deletion_queue_recovery")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;

View File

@@ -377,7 +377,7 @@ paths:
schema:
$ref: "#/components/schemas/ConflictError"
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
parameters:
- name: tenant_id
in: path
@@ -397,6 +397,51 @@ paths:
"202":
description: Tenant scheduled to load successfully
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
put:
description: |
Either archives or unarchives the given timeline.
An archived timeline may not have any non-archived children.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/ArchivalConfigRequest"
responses:
"200":
description: Timeline (un)archived successfully
"409":
description: |
The tenant/timeline is already being modified, perhaps by a concurrent call to this API
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
- name: tenant_id
@@ -429,7 +474,9 @@ paths:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
text/html:
description: SVG representation of the tenant and it's timelines.
schema:
type: string
description: SVG representation of the tenant and its timelines.
"401":
description: Unauthorized Error
content:
@@ -568,7 +615,7 @@ paths:
type: string
- name: timeline_id
in: path
ŕequired: true
required: true
schema:
type: string
@@ -774,15 +821,13 @@ components:
TenantCreateRequest:
allOf:
- $ref: '#/components/schemas/TenantConfig'
- $ref: '#/components/schemas/TenantLoadRequest'
- type: object
required:
- new_tenant_id
properties:
new_tenant_id:
type: string
generation:
type: integer
description: Attachment generation number.
TenantLoadRequest:
type: object
properties:
@@ -846,6 +891,15 @@ components:
warm:
type: boolean
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
ArchivalConfigRequest:
type: object
required:
- state
properties:
state:
description: The archival state of a timeline
type: string
enum: ["Archived", "Unarchived"]
TenantConfig:
type: object
properties:
@@ -1106,7 +1160,7 @@ components:
reparented_timelines:
type: array
description: Set of reparented timeline ids
properties:
items:
type: string
format: hex
description: TimelineId
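For reference, a minimal client-side sketch of the new archival_config call described by the spec changes above. The host/port, tenant shard ID, and the use of the `reqwest` (with its `json` feature), `serde_json`, and `tokio` crates are illustrative assumptions rather than part of this PR; only the path shape and the `ArchivalConfigRequest` body follow the spec. Note that the spec hunk declares `put` while the router hunk later in this diff registers the path with POST; the sketch follows the spec.

```rust
// Hedged sketch: archive a timeline via the new endpoint.
// Everything except the URL path shape and the {"state": ...} body is assumed.
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tenant_shard_id = "1f359dd625e519a1a4e8d7509690f6fc"; // hypothetical
    let timeline_id = "11223344556677881122334455667788"; // hypothetical
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config"
    );

    // Body per the ArchivalConfigRequest schema: "Archived" or "Unarchived".
    let body = json!({ "state": "Archived" });

    let resp = reqwest::Client::new().put(&url).json(&body).send().await?;
    println!("archival_config -> {}", resp.status());
    Ok(())
}
```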

View File

@@ -18,14 +18,17 @@ use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigRequest;
use pageserver_api::models::TenantLocationConfigResponse;
use pageserver_api::models::TenantScanRemoteStorageResponse;
use pageserver_api::models::TenantScanRemoteStorageShard;
@@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
@@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler(
json_response(StatusCode::OK, ())
}
async fn timeline_archival_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
state = ?request_data.state,
%timeline_id))
.await?;
json_response(StatusCode::OK, ())
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -1721,7 +1755,9 @@ async fn timeline_detach_ancestor_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::timeline::detach_ancestor::Options;
use crate::tenant::timeline::detach_ancestor;
use pageserver_api::models::detach_ancestor::AncestorDetached;
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1729,7 +1765,7 @@ async fn timeline_detach_ancestor_handler(
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
async move {
let mut options = Options::default();
let mut options = detach_ancestor::Options::default();
let rewrite_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1757,27 +1793,36 @@ async fn timeline_detach_ancestor_handler(
let timeline = tenant.get_timeline(timeline_id, true)?;
let (_guard, prepared) = timeline
let progress = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await?;
let res = state
.tenant_manager
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
.await;
// uncomment to allow early as possible Tenant::drop
// drop(tenant);
match res {
Ok(reparented_timelines) => {
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
let resp = match progress {
detach_ancestor::Progress::Prepared(_guard, prepared) => {
// it would be great to tag the guard on to the tenant activation future
let reparented_timelines = state
.tenant_manager
.complete_detaching_timeline_ancestor(
tenant_shard_id,
timeline_id,
prepared,
ctx,
)
.await
.context("timeline detach ancestor completion")
.map_err(ApiError::InternalServerError)?;
AncestorDetached {
reparented_timelines,
};
json_response(StatusCode::OK, resp)
}
}
Err(e) => Err(ApiError::InternalServerError(
e.context("timeline detach completion"),
)),
}
detach_ancestor::Progress::Done(resp) => resp,
};
json_response(StatusCode::OK, resp)
}
.instrument(span)
.await
@@ -2778,6 +2823,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|r| api_handler(r, timeline_archival_config_handler),
)
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
})

View File

@@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum MetricLayerKind {
Delta,
Image,
}
static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_bytes",
"Sum of layer physical sizes in bytes",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_count",
"Number of layers that exist",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_archive_size",
@@ -585,6 +610,22 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_total",
"Size of uncompressed data written into image layers"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_out_bytes_total",
"Size of compressed image layer written"
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -1490,7 +1531,6 @@ pub(crate) enum ComputeCommandKind {
Basebackup,
Fullbackup,
LeaseLsn,
Show,
}
pub(crate) struct ComputeCommandCounters {
@@ -2142,6 +2182,10 @@ pub(crate) struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
pub(crate) layer_count_image: UIntGauge,
pub(crate) layer_size_delta: UIntGauge,
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2224,6 +2268,42 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let layer_size_image = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_count_image = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_size_delta = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let layer_count_delta = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2278,6 +2358,10 @@ impl TimelineMetrics {
last_record_gauge,
pitr_history_size,
archival_size,
layer_size_image,
layer_count_image,
layer_size_delta,
layer_count_delta,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
@@ -2339,6 +2423,31 @@ impl TimelineMetrics {
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);

View File

@@ -1479,66 +1479,6 @@ where
))?
}
};
} else if let Some(params) = parts.strip_prefix(&["show"]) {
// show <tenant_id>
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
tracing::Span::current().record("tenant_id", field::display(tenant_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Show)
.inc();
let tenant = self
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
RowDescriptor::int8_col(b"compaction_target_size"),
RowDescriptor::int8_col(b"compaction_period"),
RowDescriptor::int8_col(b"compaction_threshold"),
RowDescriptor::int8_col(b"gc_horizon"),
RowDescriptor::int8_col(b"gc_period"),
RowDescriptor::int8_col(b"image_creation_threshold"),
RowDescriptor::int8_col(b"pitr_interval"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
Some(
tenant
.get_checkpoint_timeout()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_target_size().to_string().as_bytes()),
Some(
tenant
.get_compaction_period()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_threshold().to_string().as_bytes()),
Some(tenant.get_gc_horizon().to_string().as_bytes()),
Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"

View File

@@ -2031,7 +2031,7 @@ mod tests {
#[tokio::test]
async fn aux_files_round_trip() -> anyhow::Result<()> {
let name = "aux_files_round_trip";
let harness = TenantHarness::create(name)?;
let harness = TenantHarness::create(name).await?;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));

View File

@@ -21,6 +21,7 @@ use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::TimelineState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::WalRedoManagerStatus;
@@ -1228,6 +1229,14 @@ impl Tenant {
Ok(timeline_preloads)
}
pub async fn apply_timeline_archival_config(
&self,
_timeline_id: TimelineId,
_config: TimelineArchivalState,
) -> anyhow::Result<()> {
Ok(())
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
@@ -2912,7 +2921,7 @@ impl Tenant {
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
}
}
@@ -2928,7 +2937,7 @@ impl Tenant {
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.pitr)
.checked_sub(target.cutoffs.time)
.unwrap_or(Lsn(0))
.0,
);
@@ -3788,7 +3797,7 @@ pub(crate) mod harness {
}
impl TenantHarness {
pub fn create_custom(
pub async fn create_custom(
test_name: &'static str,
tenant_conf: TenantConf,
tenant_id: TenantId,
@@ -3824,7 +3833,7 @@ pub(crate) mod harness {
},
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
Ok(Self {
@@ -3839,7 +3848,7 @@ pub(crate) mod harness {
})
}
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
// Disable automatic GC and compaction to make the unit tests more deterministic.
// The tests perform them manually if needed.
let tenant_conf = TenantConf {
@@ -3856,6 +3865,7 @@ pub(crate) mod harness {
shard,
Generation::new(0xdeadbeef),
)
.await
}
pub fn span(&self) -> tracing::Span {
@@ -3992,7 +4002,7 @@ mod tests {
#[tokio::test]
async fn test_basic() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4039,7 +4049,8 @@ mod tests {
#[tokio::test]
async fn no_duplicate_timelines() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")?
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
.await?
.load()
.await;
let _ = tenant
@@ -4071,7 +4082,7 @@ mod tests {
async fn test_branch() -> anyhow::Result<()> {
use std::str::from_utf8;
let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4193,7 +4204,8 @@ mod tests {
#[tokio::test]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
.await?
.load()
.await;
let tline = tenant
@@ -4240,7 +4252,8 @@ mod tests {
#[tokio::test]
async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
.await?
.load()
.await;
@@ -4262,7 +4275,7 @@ mod tests {
.source()
.unwrap()
.to_string()
.contains("is earlier than latest GC horizon"));
.contains("is earlier than latest GC cutoff"));
}
}
@@ -4295,7 +4308,8 @@ mod tests {
#[tokio::test]
async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
.await?
.load()
.await;
let tline = tenant
@@ -4352,7 +4366,8 @@ mod tests {
#[tokio::test]
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
.await?
.load()
.await;
let tline = tenant
@@ -4382,10 +4397,10 @@ mod tests {
}
#[tokio::test]
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
.load()
.await;
let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4423,7 +4438,7 @@ mod tests {
#[tokio::test]
async fn timeline_load() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = TenantHarness::create(TEST_NAME)?;
let harness = TenantHarness::create(TEST_NAME).await?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -4450,7 +4465,7 @@ mod tests {
#[tokio::test]
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = TenantHarness::create(TEST_NAME)?;
let harness = TenantHarness::create(TEST_NAME).await?;
// create two timelines
{
let (tenant, ctx) = harness.load().await;
@@ -4498,7 +4513,10 @@ mod tests {
#[tokio::test]
async fn delta_layer_dumping() -> anyhow::Result<()> {
use storage_layer::AsLayerDesc;
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4525,7 +4543,7 @@ mod tests {
#[tokio::test]
async fn test_images() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_images")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -4696,7 +4714,7 @@ mod tests {
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_bulk_insert")?;
let harness = TenantHarness::create("test_bulk_insert").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -4727,7 +4745,7 @@ mod tests {
// so the search can stop at the first delta layer and doesn't traverse any deeper.
#[tokio::test]
async fn test_get_vectored() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored")?;
let harness = TenantHarness::create("test_get_vectored").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -4805,7 +4823,7 @@ mod tests {
#[tokio::test]
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -4891,7 +4909,8 @@ mod tests {
TenantId::generate(),
ShardIdentity::unsharded(),
Generation::new(0xdeadbeef),
)?;
)
.await?;
let (tenant, ctx) = harness.load().await;
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5034,7 +5053,7 @@ mod tests {
// ```
#[tokio::test]
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
let (tenant, ctx) = harness.load().await;
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
@@ -5183,7 +5202,7 @@ mod tests {
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
let mut harness = TenantHarness::create(name).await?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
@@ -5267,7 +5286,8 @@ mod tests {
#[tokio::test]
async fn test_traverse_branches() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")?
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")
.await?
.load()
.await;
let mut tline = tenant
@@ -5357,7 +5377,8 @@ mod tests {
#[tokio::test]
async fn test_traverse_ancestors() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")?
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")
.await?
.load()
.await;
let mut tline = tenant
@@ -5423,7 +5444,8 @@ mod tests {
#[tokio::test]
async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")?
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")
.await?
.load()
.await;
@@ -5492,7 +5514,7 @@ mod tests {
#[tokio::test]
async fn test_create_guard_crash() -> anyhow::Result<()> {
let name = "test_create_guard_crash";
let harness = TenantHarness::create(name)?;
let harness = TenantHarness::create(name).await?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -5545,7 +5567,7 @@ mod tests {
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
let mut harness = TenantHarness::create(name).await?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
@@ -5569,7 +5591,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_scan() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_scan")?;
let harness = TenantHarness::create("test_metadata_scan").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5688,7 +5710,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
let harness = TenantHarness::create("test_metadata_compaction_trigger").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5747,7 +5769,9 @@ mod tests {
#[tokio::test]
async fn test_branch_copies_dirty_aux_file_flag() {
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
.await
.unwrap();
// the default aux file policy to switch is v1 if not set by the admins
assert_eq!(
@@ -5849,7 +5873,9 @@ mod tests {
#[tokio::test]
async fn aux_file_policy_switch() {
let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
let mut harness = TenantHarness::create("aux_file_policy_switch")
.await
.unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
let (tenant, ctx) = harness.load().await;
@@ -6023,7 +6049,9 @@ mod tests {
#[tokio::test]
async fn aux_file_policy_force_switch() {
let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
let mut harness = TenantHarness::create("aux_file_policy_force_switch")
.await
.unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
let (tenant, ctx) = harness.load().await;
@@ -6084,7 +6112,9 @@ mod tests {
#[tokio::test]
async fn aux_file_policy_auto_detect() {
let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
.await
.unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to v2 mode
let (tenant, ctx) = harness.load().await;
@@ -6147,7 +6177,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_image_creation")?;
let harness = TenantHarness::create("test_metadata_image_creation").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -6246,7 +6276,7 @@ mod tests {
#[tokio::test]
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
@@ -6318,7 +6348,7 @@ mod tests {
#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6410,7 +6440,7 @@ mod tests {
#[tokio::test]
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_reads")?;
let harness = TenantHarness::create("test_metadata_tombstone_reads").await?;
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6490,7 +6520,9 @@ mod tests {
#[tokio::test]
async fn test_metadata_tombstone_image_creation() {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6562,8 +6594,9 @@ mod tests {
#[tokio::test]
async fn test_metadata_tombstone_empty_image_creation() {
let harness =
TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6626,7 +6659,7 @@ mod tests {
#[tokio::test]
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -6718,8 +6751,8 @@ mod tests {
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.pitr = Lsn(0x30);
guard.cutoffs.horizon = Lsn(0x30);
guard.cutoffs.time = Lsn(0x30);
guard.cutoffs.space = Lsn(0x30);
}
let expected_result = [
@@ -6810,7 +6843,7 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..get_key(10),
key_range: Key::MIN..Key::MAX,
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
@@ -6834,7 +6867,7 @@ mod tests {
#[tokio::test]
async fn test_neon_test_record() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_neon_test_record")?;
let harness = TenantHarness::create("test_neon_test_record").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -6915,7 +6948,7 @@ mod tests {
#[tokio::test]
async fn test_lsn_lease() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_lsn = Lsn(0x100);
@@ -7004,7 +7037,7 @@ mod tests {
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas")?;
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -7109,8 +7142,8 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
pitr: Lsn(0x30),
horizon: Lsn(0x30),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,

View File

@@ -262,7 +262,7 @@ where
pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
where
R: 'a,
R: 'a + Send,
{
DiskBtreeIterator {
stream: Box::pin(self.into_stream(start_key, ctx)),
@@ -521,7 +521,7 @@ where
pub struct DiskBtreeIterator<'a> {
#[allow(clippy::type_complexity)]
stream: std::pin::Pin<
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
>,
}

View File

@@ -2698,7 +2698,9 @@ mod tests {
// Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
// wait for it to complete before proceeding.
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
.await
.unwrap();
let (t, _ctx) = h.load().await;
// harness loads it to active, which is forced and nothing is running on the tenant

View File

@@ -241,7 +241,7 @@ use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::Generation;
pub(crate) use download::{
@@ -1930,6 +1930,31 @@ impl RemoteTimelineClient {
}
}
}
/// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
/// externally to RemoteTimelineClient.
pub(crate) fn initialized_upload_queue(
&self,
) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
let mut inner = self.upload_queue.lock().unwrap();
inner.initialized_mut()?;
Ok(UploadQueueAccessor { inner })
}
}
pub(crate) struct UploadQueueAccessor<'a> {
inner: std::sync::MutexGuard<'a, UploadQueue>,
}
impl<'a> UploadQueueAccessor<'a> {
pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
match &*self.inner {
UploadQueue::Initialized(x) => &x.clean.0,
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
unreachable!("checked before constructing")
}
}
}
}
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -2103,7 +2128,7 @@ mod tests {
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result<Self> {
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let harness = TenantHarness::create(test_name).await?;
let (tenant, ctx) = harness.load().await;
let timeline = tenant

View File

@@ -176,6 +176,24 @@ pub(crate) struct Lineage {
///
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
// FIXME: this is insufficient even for path of two timelines for future wal recovery
// purposes:
//
// assuming a "old main" which has received most of the WAL, and has a branch "new main",
// starting a bit before "old main" last_record_lsn. the current version works fine,
// because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
//
// then assuming "new main" would similarly receive a branch right before its last_record_lsn,
// "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
// here. however, we cannot recover from WAL using only that information, we would need the
// whole ancestry here:
//
// ```json
// [
// ["old main", ancestor_lsn("new main"), _],
// ["new main", ancestor_lsn("new new main"), _]
// ]
// ```
#[serde(skip_serializing_if = "Option::is_none", default)]
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}
@@ -217,6 +235,14 @@ impl Lineage {
self.original_ancestor
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
}
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
self.original_ancestor.is_some()
}
pub(crate) fn is_reparented(&self) -> bool {
!self.reparenting_history.is_empty()
}
}
#[cfg(test)]

View File

@@ -135,11 +135,9 @@ pub struct TimelineInputs {
ancestor_lsn: Lsn,
last_record: Lsn,
latest_gc_cutoff: Lsn,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
/// Cutoff point based on GC settings
next_gc_cutoff: Lsn,
next_pitr_cutoff: Lsn,
/// Cutoff point calculated from the user-supplied 'max_retention_period'
retention_param_cutoff: Option<Lsn>,
@@ -150,7 +148,7 @@ pub struct TimelineInputs {
/// Gathers the inputs for the tenant sizing model.
///
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`TimelineInputs::latest_gc_cutoff`].
///
@@ -158,11 +156,8 @@ pub struct TimelineInputs {
///
/// ```text
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// initdb_lsn branchpoints* next_pitr_cutoff latest
/// ```
///
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
/// tenant size will be zero.
pub(super) async fn gather_inputs(
tenant: &Tenant,
limit: &Arc<Semaphore>,
@@ -172,7 +167,7 @@ pub(super) async fn gather_inputs(
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
// refresh is needed to update [`timeline::GcCutoffs`]
tenant.refresh_gc_info(cancel, ctx).await?;
// Collect information about all the timelines
@@ -236,20 +231,18 @@ pub(super) async fn gather_inputs(
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
//
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
// We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
// than our internal space cutoff. This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// horizon_cutoff.
let pitr_cutoff = gc_info.cutoffs.pitr;
let horizon_cutoff = gc_info.cutoffs.horizon;
let mut next_gc_cutoff = pitr_cutoff;
// the space cutoff.
let mut next_pitr_cutoff = gc_info.cutoffs.time;
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
if next_gc_cutoff < param_cutoff {
next_gc_cutoff = param_cutoff;
if next_pitr_cutoff < param_cutoff {
next_pitr_cutoff = param_cutoff;
}
Some(param_cutoff)
} else {
@@ -263,7 +256,7 @@ pub(super) async fn gather_inputs(
.copied()
.collect::<Vec<_>>();
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -291,10 +284,10 @@ pub(super) async fn gather_inputs(
)
}
// Add a point for the GC cutoff
let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
// Add a point for the PITR cutoff
let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
if !branch_start_needed {
lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
}
lsns.sort_unstable();
@@ -333,7 +326,7 @@ pub(super) async fn gather_inputs(
parent: Some(parent),
lsn: lsn.0,
size: None,
needed: lsn > next_gc_cutoff,
needed: lsn > next_pitr_cutoff,
},
timeline_id: timeline.timeline_id,
kind,
@@ -357,8 +350,8 @@ pub(super) async fn gather_inputs(
segment: Segment {
parent: Some(lease_parent),
lsn: lsn.0,
size: None, // Filled in later, if necessary
needed: lsn > next_gc_cutoff, // only needed if the point is within retention.
size: None, // Filled in later, if necessary
needed: lsn > next_pitr_cutoff, // only needed if the point is within retention.
},
timeline_id: timeline.timeline_id,
kind: LsnKind::LeaseStart,
@@ -398,9 +391,7 @@ pub(super) async fn gather_inputs(
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
horizon_cutoff,
pitr_cutoff,
next_gc_cutoff,
next_pitr_cutoff,
retention_param_cutoff,
lease_points,
});
@@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/18D3D98",
"last_record": "0/2230CD0",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/2210CD0",
"pitr_cutoff": "0/2210CD0",
"next_gc_cutoff": "0/2210CD0",
"next_pitr_cutoff": "0/2210CD0",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/176D998",
"last_record": "0/1837770",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/1817770",
"pitr_cutoff": "0/1817770",
"next_gc_cutoff": "0/1817770",
"next_pitr_cutoff": "0/1817770",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/0",
"last_record": "0/18D3D98",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/18B3D98",
"pitr_cutoff": "0/18B3D98",
"next_gc_cutoff": "0/18B3D98",
"next_pitr_cutoff": "0/18B3D98",
"retention_param_cutoff": null,
"lease_points": []
}
@@ -820,9 +805,7 @@ fn verify_size_for_one_branch() {
"ancestor_lsn": "0/0",
"last_record": "47/280A5860",
"latest_gc_cutoff": "47/240A5860",
"horizon_cutoff": "47/240A5860",
"pitr_cutoff": "47/240A5860",
"next_gc_cutoff": "47/240A5860",
"next_pitr_cutoff": "47/240A5860",
"retention_param_cutoff": "0/0",
"lease_points": []
}
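To make the retention arithmetic above concrete, here is a small self-contained sketch (not repo code) of how `next_pitr_cutoff` interacts with a caller-supplied `max_retention_period`. `Lsn` is modeled as a plain `u64`, and the retention period is treated as a byte offset subtracted from the last-record LSN, mirroring the `saturating_sub` in the diff.

```rust
// Hypothetical standalone model of the cutoff selection shown above:
// a caller-supplied retention window can only move the cutoff forward
// (retain less history), never extend retention past the PITR cutoff.
fn effective_pitr_cutoff(
    last_record_lsn: u64,
    pitr_cutoff: u64,
    max_retention_period: Option<u64>,
) -> u64 {
    match max_retention_period {
        Some(period) => {
            let param_cutoff = last_record_lsn.saturating_sub(period);
            pitr_cutoff.max(param_cutoff)
        }
        None => pitr_cutoff,
    }
}

fn main() {
    // A short caller-supplied window wins over the PITR cutoff...
    assert_eq!(
        effective_pitr_cutoff(0x1837770, 0x1817770, Some(0x1000)),
        0x1836770
    );
    // ...while with no parameter the PITR cutoff is used as-is.
    assert_eq!(effective_pitr_cutoff(0x1837770, 0x1817770, None), 0x1817770);
    println!("ok");
}
```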

View File

@@ -6,8 +6,6 @@ pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
#[cfg(test)]
pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
@@ -676,6 +674,26 @@ impl LayerAccessStats {
},
}
}
/// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
///
/// This indicates whether the layer has been used for some purpose that would motivate
/// us to keep it on disk, such as for serving a getpage request.
fn accessed(&self) -> bool {
let locked = self.0.lock().unwrap();
let inner = &locked.for_eviction_policy;
// Consider it accessed if the most recent access is more recent than
// the most recent change in residence status.
match (
inner.last_accesses.recent(),
inner.last_residence_changes.recent(),
) {
(None, _) => false,
(Some(_), None) => true,
(Some(a), Some(r)) => a.when >= r.timestamp,
}
}
}
/// Get a layer descriptor from a layer.

View File

@@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile};
@@ -53,6 +56,7 @@ use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
@@ -747,12 +751,10 @@ impl DeltaLayer {
}
impl DeltaLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.layer_key_range
}
#[cfg(test)]
pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
&self.layer_lsn_range
}
@@ -1180,9 +1182,7 @@ impl DeltaLayerInner {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
Adapter(self),
)),
layer: self,
};
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
@@ -1426,7 +1426,7 @@ impl DeltaLayerInner {
let keys = self.load_keys(ctx).await?;
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let buf = val.load_raw(ctx).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
@@ -1461,8 +1461,7 @@ impl DeltaLayerInner {
use pageserver_api::key::CHECKPOINT_KEY;
use postgres_ffi::CheckPoint;
if key == CHECKPOINT_KEY {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
let val = val.load(ctx).await?;
match val {
Value::Image(img) => {
let checkpoint = CheckPoint::decode(&img)?;
@@ -1515,7 +1514,6 @@ impl DeltaLayerInner {
offset
}
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
@@ -1526,7 +1524,7 @@ impl DeltaLayerInner {
index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
key_values_batch: std::collections::VecDeque::new(),
is_end: false,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
planner: StreamingVectoredReadPlanner::new(
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
1024, // The default value. Unit tests might use a different value
),
@@ -1547,17 +1545,24 @@ pub struct DeltaEntry<'a> {
/// Reference to an on-disk value
pub struct ValueRef<'a> {
blob_ref: BlobRef,
reader: BlockCursor<'a>,
layer: &'a DeltaLayerInner,
}
impl<'a> ValueRef<'a> {
/// Loads the value from disk
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
let buf = self.load_raw(ctx).await?;
let val = Value::des(&buf)?;
Ok(val)
}
async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
self.layer,
)));
let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
Ok(buf)
}
}
pub(crate) struct Adapter<T>(T);
@@ -1591,17 +1596,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
}
}
#[cfg(test)]
pub struct DeltaLayerIterator<'a> {
delta_layer: &'a DeltaLayerInner,
ctx: &'a RequestContext,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
planner: StreamingVectoredReadPlanner,
index_iter: DiskBtreeIterator<'a>,
key_values_batch: VecDeque<(Key, Lsn, Value)>,
is_end: bool,
}
#[cfg(test)]
impl<'a> DeltaLayerIterator<'a> {
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1668,6 +1671,7 @@ pub(crate) mod test {
use rand::RngCore;
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::Tenant;
@@ -1677,6 +1681,7 @@ pub(crate) mod test {
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
DEFAULT_PG_VERSION,
};
use bytes::Bytes;
/// Construct an index for a fictional delta layer and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -1929,7 +1934,7 @@ pub(crate) mod test {
#[tokio::test]
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
let (tenant, ctx) = harness.load().await;
let timeline_id = TimelineId::generate();
@@ -2029,7 +2034,9 @@ pub(crate) mod test {
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let ctx = &ctx;
let timeline = tenant
@@ -2245,6 +2252,15 @@ pub(crate) mod test {
(k1, l1).cmp(&(k2, l2))
}
pub(crate) fn sort_delta_value(
(k1, l1, v1): &(Key, Lsn, Value),
(k2, l2, v2): &(Key, Lsn, Value),
) -> std::cmp::Ordering {
let order_1 = if v1.is_image() { 0 } else { 1 };
let order_2 = if v2.is_image() { 0 } else { 1 };
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
pub(crate) async fn produce_delta_layer(
tenant: &Tenant,
tline: &Arc<Timeline>,
@@ -2253,7 +2269,7 @@ pub(crate) mod test {
) -> anyhow::Result<ResidentLayer> {
deltas.sort_by(sort_delta);
let (key_start, _, _) = deltas.first().unwrap();
let (key_max, _, _) = deltas.first().unwrap();
let (key_max, _, _) = deltas.last().unwrap();
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
let lsn_end = Lsn(lsn_max.0 + 1);
@@ -2298,10 +2314,7 @@ pub(crate) mod test {
#[tokio::test]
async fn delta_layer_iterator() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
let harness = TenantHarness::create("delta_layer_iterator").await.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant

View File

@@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile};
@@ -50,6 +53,7 @@ use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
@@ -369,12 +373,10 @@ impl ImageLayer {
}
impl ImageLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.key_range
}
#[cfg(test)]
pub(crate) fn lsn(&self) -> Lsn {
self.lsn
}
@@ -699,7 +701,6 @@ impl ImageLayerInner {
}
}
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
@@ -708,9 +709,9 @@ impl ImageLayerInner {
image_layer: self,
ctx,
index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
key_values_batch: std::collections::VecDeque::new(),
key_values_batch: VecDeque::new(),
is_end: false,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
planner: StreamingVectoredReadPlanner::new(
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
1024, // The default value. Unit tests might use a different value
),
@@ -737,6 +738,9 @@ struct ImageLayerWriterInner {
key_range: Range<Key>,
lsn: Lsn,
// Total uncompressed bytes passed into put_image
uncompressed_bytes: u64,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
@@ -792,6 +796,7 @@ impl ImageLayerWriterInner {
lsn,
tree: tree_builder,
blob_writer,
uncompressed_bytes: 0,
};
Ok(writer)
@@ -810,6 +815,7 @@ impl ImageLayerWriterInner {
) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let compression = self.conf.image_compression;
self.uncompressed_bytes += img.len() as u64;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
@@ -835,6 +841,11 @@ impl ImageLayerWriterInner {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
let mut file = self.blob_writer.into_inner();
// Write out the index
@@ -974,17 +985,15 @@ impl Drop for ImageLayerWriter {
}
}
#[cfg(test)]
pub struct ImageLayerIterator<'a> {
image_layer: &'a ImageLayerInner,
ctx: &'a RequestContext,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
planner: StreamingVectoredReadPlanner,
index_iter: DiskBtreeIterator<'a>,
key_values_batch: VecDeque<(Key, Lsn, Value)>,
is_end: bool,
}
#[cfg(test)]
impl<'a> ImageLayerIterator<'a> {
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1102,6 +1111,7 @@ mod test {
ShardIdentity::unsharded(),
get_next_gen(),
)
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
@@ -1168,6 +1178,7 @@ mod test {
// But here, all we care about is that the gen number is unique.
get_next_gen(),
)
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
@@ -1299,7 +1310,7 @@ mod test {
#[tokio::test]
async fn image_layer_iterator() {
let harness = TenantHarness::create("image_layer_iterator").unwrap();
let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant

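The image layer writer above now counts every byte passed into `put_image` and, in `finish`, reports that total alongside the on-disk size (minus the header page) to the `COMPRESSION_IMAGE_INPUT_BYTES` / `COMPRESSION_IMAGE_OUTPUT_BYTES` counters. A minimal standalone sketch of the same bookkeeping, using plain `u64` fields instead of the pageserver's Prometheus metrics (all names below are illustrative only):

```rust
/// Standalone sketch of the compression accounting added above.
struct ImageWriterStats {
    uncompressed_bytes: u64, // total bytes passed into put_image
    compressed_bytes: u64,   // bytes actually written to the blob file (header excluded)
}

impl ImageWriterStats {
    fn new() -> Self {
        Self { uncompressed_bytes: 0, compressed_bytes: 0 }
    }

    /// Called once per image: `img_len` is the uncompressed payload size,
    /// `written_len` is what ended up on disk after (optional) compression.
    fn record_image(&mut self, img_len: u64, written_len: u64) {
        self.uncompressed_bytes += img_len;
        self.compressed_bytes += written_len;
    }

    /// Compression ratio derived from the two counters (smaller is better).
    fn ratio(&self) -> f64 {
        if self.uncompressed_bytes == 0 {
            return 1.0;
        }
        self.compressed_bytes as f64 / self.uncompressed_bytes as f64
    }
}

fn main() {
    let mut stats = ImageWriterStats::new();
    stats.record_image(8192, 2048);
    stats.record_image(8192, 8192); // incompressible page
    println!("compression ratio: {:.2}", stats.ratio()); // 0.62
}
```

Dividing the output counter by the input counter (e.g. over a scrape interval) gives the effective image compression ratio.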
View File

@@ -385,6 +385,7 @@ impl Layer {
}
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
#[allow(dead_code)]
pub(crate) async fn load_key_values(
&self,
ctx: &RequestContext,
@@ -693,6 +694,18 @@ impl Drop for LayerInner {
// and we could be delaying shutdown for nothing.
}
if let Some(timeline) = self.timeline.upgrade() {
// Only need to decrement metrics if the timeline still exists: otherwise
// it will have already de-registered these metrics via TimelineMetrics::shutdown
if self.desc.is_delta() {
timeline.metrics.layer_count_delta.dec();
timeline.metrics.layer_size_delta.sub(self.desc.file_size);
} else {
timeline.metrics.layer_count_image.dec();
timeline.metrics.layer_size_image.sub(self.desc.file_size);
}
}
if !*self.wanted_deleted.get_mut() {
return;
}
@@ -791,6 +804,15 @@ impl LayerInner {
(heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
};
// This object acts as a RAII guard on these metrics: increment on construction
if desc.is_delta() {
timeline.metrics.layer_count_delta.inc();
timeline.metrics.layer_size_delta.add(desc.file_size);
} else {
timeline.metrics.layer_count_image.inc();
timeline.metrics.layer_size_image.add(desc.file_size);
}
LayerInner {
conf,
debug_str: {
@@ -1469,14 +1491,22 @@ impl LayerInner {
let duration = SystemTime::now().duration_since(local_layer_mtime);
match duration {
Ok(elapsed) => {
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
let accessed = self.access_stats.accessed();
if accessed {
// Only layers used for reads contribute to our "low residence" metric that is used
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
// to be rapidly evicted without contributing to this metric.
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
}
tracing::info!(
residence_millis = elapsed.as_millis(),
accessed,
"evicted layer after known residence period"
);
}
@@ -1889,7 +1919,7 @@ impl ResidentLayer {
self.owner.metadata()
}
#[cfg(test)]
/// Cast the layer to a delta, return an error if it is an image layer.
pub(crate) async fn get_as_delta(
&self,
ctx: &RequestContext,
@@ -1901,7 +1931,7 @@ impl ResidentLayer {
}
}
#[cfg(test)]
/// Cast the layer to an image, return an error if it is a delta layer.
pub(crate) async fn get_as_image(
&self,
ctx: &RequestContext,

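The `LayerInner` changes pair an increment of the per-timeline layer count/size gauges in the constructor with a matching decrement in `Drop`, so the gauges track resident layers for as long as the owning timeline is alive. A minimal sketch of that RAII-gauge pattern, with `AtomicU64`s standing in for the real metric types (names are illustrative):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

/// Stand-in for a pair of gauges: number of layers and their total size.
#[derive(Default)]
struct LayerGauges {
    count: AtomicU64,
    bytes: AtomicU64,
}

/// RAII guard: incrementing on construction and decrementing on Drop keeps
/// the gauges consistent with the set of live `Layer` objects.
struct Layer {
    gauges: Arc<LayerGauges>,
    file_size: u64,
}

impl Layer {
    fn new(gauges: Arc<LayerGauges>, file_size: u64) -> Self {
        gauges.count.fetch_add(1, Ordering::Relaxed);
        gauges.bytes.fetch_add(file_size, Ordering::Relaxed);
        Self { gauges, file_size }
    }
}

impl Drop for Layer {
    fn drop(&mut self) {
        // The real code skips this when the owning timeline is already gone,
        // because shutdown de-registers the metrics wholesale.
        self.gauges.count.fetch_sub(1, Ordering::Relaxed);
        self.gauges.bytes.fetch_sub(self.file_size, Ordering::Relaxed);
    }
}

fn main() {
    let gauges = Arc::new(LayerGauges::default());
    {
        let _a = Layer::new(gauges.clone(), 1024);
        let _b = Layer::new(gauges.clone(), 4096);
        assert_eq!(gauges.count.load(Ordering::Relaxed), 2);
        assert_eq!(gauges.bytes.load(Ordering::Relaxed), 5120);
    }
    assert_eq!(gauges.count.load(Ordering::Relaxed), 0);
    assert_eq!(gauges.bytes.load(Ordering::Relaxed), 0);
}
```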
View File

@@ -22,7 +22,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
async fn smoke_test() {
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("smoke_test").unwrap();
let h = TenantHarness::create("smoke_test").await.unwrap();
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let (tenant, _) = h.load().await;
@@ -176,7 +176,9 @@ async fn evict_and_wait_on_wanted_deleted() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted")
.await
.unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;
@@ -258,7 +260,9 @@ fn read_wins_pending_eviction() {
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
let h = TenantHarness::create("read_wins_pending_eviction")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -390,7 +394,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create(name).unwrap();
let h = TenantHarness::create(name).await.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -559,8 +563,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
#[tokio::test(start_paused = true)]
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let handle = tokio::runtime::Handle::current();
let h =
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let timeline = tenant
@@ -636,7 +641,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
#[tokio::test(start_paused = true)]
async fn evict_and_wait_does_not_wait_for_download() {
// let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -733,7 +740,9 @@ async fn eviction_cancellation_on_drop() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
let h = TenantHarness::create("eviction_cancellation_on_drop")
.await
.unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;

View File

@@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use std::cmp::Ordering;
let a = self.peek_next_key_lsn();
let b = other.peek_next_key_lsn();
let a = self.peek_next_key_lsn_value();
let b = other.peek_next_key_lsn_value();
match (a, b) {
(Some((k1, l1)), Some((k2, l2))) => {
let loaded_1 = if self.is_loaded() { 1 } else { 0 };
let loaded_2 = if other.is_loaded() { 1 } else { 0 };
(Some((k1, l1, v1)), Some((k2, l2, v2))) => {
fn map_value_to_num(val: &Option<&Value>) -> usize {
match val {
None => 0,
Some(Value::Image(_)) => 1,
Some(Value::WalRecord(_)) => 2,
}
}
let order_1 = map_value_to_num(&v1);
let order_2 = map_value_to_num(&v2);
// When key_lsn are the same, the unloaded iter will always appear before the loaded one.
// And note that we do a reverse at the end of the comparison, so it works with the max heap.
(k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
(Some(_), None) => Ordering::Less,
(None, Some(_)) => Ordering::Greater,
@@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> {
}
}
fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
match self {
Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
Self::Loaded { iter } => iter
.peek()
.as_ref()
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
Self::NotLoaded {
first_key_lower_bound: (key, lsn),
..
} => Some((key, *lsn)),
} => Some((key, *lsn, None)),
}
}
@@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> {
}
}
/// A merge iterator over delta/image layer iterators. When duplicated records are
/// found, the iterator will not perform any deduplication, and the caller should handle
/// these situations. Duplicated records can arise in several ways:
/// * Two identical deltas at the same LSN.
/// * Two identical images at the same LSN.
/// * A delta and an image at the same LSN, where the image has already applied the delta.
/// The iterator will always put the image before the delta.
pub struct MergeIterator<'a> {
heap: BinaryHeap<IteratorWrapper<'a>>,
}
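The ordering change above keys the heap on the value type rather than on loaded/not-loaded state: at the same `(key, lsn)`, a not-yet-loaded iterator sorts first, then an image, then a WAL record, and the comparison is reversed so it works with `BinaryHeap`'s max-heap semantics. A small self-contained sketch of that sort key, with simplified stand-ins for `Key`, `Lsn`, and `Value`:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Simplified stand-ins for the pageserver's Key/Lsn/Value types.
type Key = u64;
type Lsn = u64;

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Value {
    Image(&'static str),
    WalRecord(&'static str),
}

/// Same idea as `map_value_to_num` above: unloaded (no value yet) < image < delta,
/// so images are yielded before deltas at the same (key, lsn).
fn value_rank(v: &Option<Value>) -> usize {
    match v {
        None => 0,
        Some(Value::Image(_)) => 1,
        Some(Value::WalRecord(_)) => 2,
    }
}

fn main() {
    let entries = vec![
        (1u64, 10u64, Some(Value::WalRecord("+a"))),
        (1, 10, Some(Value::Image("a"))),
        (1, 10, None), // an iterator that has not loaded its first value yet
    ];

    // BinaryHeap is a max-heap, so wrap the sort key in Reverse to pop the
    // smallest (key, lsn, rank) first, mirroring the reverse() in cmp().
    let mut heap: BinaryHeap<Reverse<(Key, Lsn, usize, Option<Value>)>> = entries
        .into_iter()
        .map(|(k, l, v)| Reverse((k, l, value_rank(&v), v)))
        .collect();

    while let Some(Reverse((k, l, _, v))) = heap.pop() {
        println!("{k}@{l}: {v:?}"); // None first, then Image, then WalRecord
    }
}
```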
@@ -245,8 +262,9 @@ mod tests {
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
},
walrecord::NeonWalRecord,
DEFAULT_PG_VERSION,
};
@@ -275,7 +293,9 @@ mod tests {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_merge_in_between").unwrap();
let harness = TenantHarness::create("merge_iterator_merge_in_between")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -338,7 +358,9 @@ mod tests {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
let harness = TenantHarness::create("merge_iterator_delta_merge")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -407,6 +429,133 @@ mod tests {
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
}
// TODO: image layer merge, delta+image mixed merge
// TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
#[tokio::test]
async fn delta_image_mixed_merge() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// In this test case, we want to test if the iterator still works correctly with multiple copies
// of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
// Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
// An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
// could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
// one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
// correctly process these situations and return everything as-is, and the upper layer of the system
// will handle duplicated LSNs.
let test_deltas1 = vec![
(
get_key(0),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(0),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("a")),
),
(
get_key(5),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(5),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("b")),
),
];
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();
let mut test_deltas2 = test_deltas1.clone();
test_deltas2.push((
get_key(10),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
.await
.unwrap();
let test_deltas3 = vec![
(
get_key(0),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"")),
),
(
get_key(5),
Lsn(0x18),
Value::Image(Bytes::copy_from_slice(b"b")),
),
(
get_key(15),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
),
];
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
.await
.unwrap();
let mut test_deltas4 = test_deltas3.clone();
test_deltas4.push((
get_key(20),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
.await
.unwrap();
let mut expect = Vec::new();
expect.extend(test_deltas1);
expect.extend(test_deltas2);
expect.extend(test_deltas3);
expect.extend(test_deltas4);
expect.sort_by(sort_delta_value);
// Test with different layer order for MergeIterator::create to ensure the order
// is stable.
let mut merge_iter = MergeIterator::create(
&[
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
let mut merge_iter = MergeIterator::create(
&[
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
is_send(merge_iter);
}
fn is_send(_: impl Send) {}
}

View File

@@ -69,6 +69,7 @@ use std::{
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
config::defaults::DEFAULT_PITR_INTERVAL,
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
storage_layer::PersistentLayerDesc,
@@ -197,7 +198,7 @@ impl PartialOrd for Hole {
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
drop(rlock)
}
@@ -270,7 +271,7 @@ pub struct Timeline {
///
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
@@ -477,37 +478,32 @@ impl GcInfo {
}
}
/// The `GcInfo` component describing which Lsns need to be retained.
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
/// between time-based and space-based retention for observability and consumption metrics purposes.
#[derive(Debug)]
pub(crate) struct GcCutoffs {
/// Keep everything newer than this point.
///
/// This is calculated by subtracting 'gc_horizon' setting from
/// last-record LSN
///
/// FIXME: is this inclusive or exclusive?
pub(crate) horizon: Lsn,
/// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
/// history we must keep to retain a specified number of bytes of WAL.
pub(crate) space: Lsn,
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
/// point.
///
/// This is calculated by finding a number such that a record is needed for PITR
/// if only if its LSN is larger than 'pitr_cutoff'.
pub(crate) pitr: Lsn,
/// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
/// history we must keep to enable reading back at least the PITR interval duration.
pub(crate) time: Lsn,
}
impl Default for GcCutoffs {
fn default() -> Self {
Self {
horizon: Lsn::INVALID,
pitr: Lsn::INVALID,
space: Lsn::INVALID,
time: Lsn::INVALID,
}
}
}
impl GcCutoffs {
fn select_min(&self) -> Lsn {
std::cmp::min(self.horizon, self.pitr)
std::cmp::min(self.space, self.time)
}
}
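After the rename, `GcCutoffs` carries one LSN for space-based retention (derived from `gc_horizon`) and one for time-based retention (derived from `pitr_interval`); the effective GC cutoff is the smaller of the two, i.e. whichever keeps more history. A tiny worked sketch with made-up LSN values:

```rust
/// Minimal stand-in for the pageserver's Lsn newtype.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

struct GcCutoffs {
    space: Lsn, // keep at least `gc_horizon` bytes of WAL history
    time: Lsn,  // keep at least `pitr_interval` worth of history
}

impl GcCutoffs {
    fn select_min(&self) -> Lsn {
        std::cmp::min(self.space, self.time)
    }
}

fn main() {
    // Say the size-based cutoff would allow GC up to 0/5000, but the PITR
    // window still needs everything from 0/3000 onwards: GC must stop at 0/3000.
    let cutoffs = GcCutoffs { space: Lsn(0x5000), time: Lsn(0x3000) };
    assert_eq!(cutoffs.select_min(), Lsn(0x3000));
    println!("effective GC cutoff: {:?}", cutoffs.select_min());
}
```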
@@ -866,7 +862,7 @@ impl Timeline {
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.pitr)
.checked_sub(gc_info.cutoffs.time)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
@@ -1565,7 +1561,7 @@ impl Timeline {
) -> anyhow::Result<()> {
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
"LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
lsn,
**latest_gc_cutoff_lsn,
);
@@ -3408,6 +3404,7 @@ impl Timeline {
}
}
#[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
#[allow(clippy::doc_lazy_continuation)]
/// Get the data needed to reconstruct all keys in the provided keyspace
///
@@ -4732,13 +4729,7 @@ impl Timeline {
tenant: &crate::tenant::Tenant,
options: detach_ancestor::Options,
ctx: &RequestContext,
) -> Result<
(
completion::Completion,
detach_ancestor::PreparedTimelineDetach,
),
detach_ancestor::Error,
> {
) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
detach_ancestor::prepare(self, tenant, options, ctx).await
}
@@ -4945,24 +4936,21 @@ impl Timeline {
}
/// Find the Lsns above which layer files need to be retained on
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete
/// page versions more aggressively.
/// garbage collection.
///
/// TODO: that's wishful thinking, compaction doesn't actually do that
/// currently.
/// We calculate two cutoffs, one based on time and one based on WAL size. `pitr`
/// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
/// the space-based retention.
///
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
/// needed by read-only nodes. (As of this writing, the caller just passes
/// the latest LSN subtracted by a constant, and doesn't do anything smart
/// to figure out what read-only nodes might actually need.)
///
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
/// whether a record is needed for PITR.
/// This function doesn't simply calculate time & space based retention: it treats time-based
/// retention as authoritative if enabled, and falls back to space-based retention if calculating
/// the LSN for a time point isn't possible. Therefore the GcCutoffs::space in the response might
/// be different from the `space_cutoff` input. Callers should treat the min() of the two cutoffs
/// in the response as the GC cutoff point for the timeline.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(super) async fn find_gc_cutoffs(
&self,
cutoff_horizon: Lsn,
space_cutoff: Lsn,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -4975,58 +4963,87 @@ impl Timeline {
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
//
// Some unit tests depend on garbage-collection working even when
// CLOG data is missing, so that find_lsn_for_timestamp() doesn't
// work, so avoid calling it altogether if time-based retention is not
// configured. It would be pointless anyway.
let pitr_cutoff = if pitr != Duration::ZERO {
let now = SystemTime::now();
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
match self
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
.await?
{
LsnForTimestamp::Present(lsn) => lsn,
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there hasn't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
self.get_last_record_lsn()
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
}
} else {
// If we don't have enough data to convert to LSN,
// play safe and don't remove any layers.
*self.get_latest_gc_cutoff_lsn()
if cfg!(test) {
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
if pitr == Duration::ZERO {
return Ok(GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
});
}
}
// Calculate a time-based limit on how much to retain:
// - if PITR interval is set, then this is our cutoff.
// - if PITR interval is not set, then we do a lookup
// based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
let time_cutoff = {
let now = SystemTime::now();
let time_range = if pitr == Duration::ZERO {
humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
} else {
pitr
};
// If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
let timestamp = to_pg_timestamp(time_cutoff);
match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
LsnForTimestamp::Present(lsn) => Some(lsn),
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there hasn't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
Some(self.get_last_record_lsn())
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
None
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
None
}
}
} else {
// No time-based retention was configured. Interpret this as "keep no history".
self.get_last_record_lsn()
};
Ok(GcCutoffs {
horizon: cutoff_horizon,
pitr: pitr_cutoff,
Ok(match (pitr, time_cutoff) {
(Duration::ZERO, Some(time_cutoff)) => {
// PITR is not set. Retain the size-based limit, or the default time retention,
// whichever requires less data.
GcCutoffs {
time: self.get_last_record_lsn(),
space: std::cmp::max(time_cutoff, space_cutoff),
}
}
(Duration::ZERO, None) => {
// PITR is not set, and time lookup failed
GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
}
}
(_, None) => {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: *self.get_latest_gc_cutoff_lsn(),
space: space_cutoff,
}
}
(_, Some(time_cutoff)) => {
// PITR interval is set and we looked up timestamp successfully. Ignore
// size based retention and make time cutoff authoritative
GcCutoffs {
time: time_cutoff,
space: time_cutoff,
}
}
})
}
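The match at the end of `find_gc_cutoffs` amounts to a small decision table over whether a PITR interval is configured and whether the timestamp-to-LSN lookup succeeded. A condensed restatement of the same four cases, with placeholder names and LSN values (not the pageserver's API):

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Lsn(u64);

struct Cutoffs { time: Lsn, space: Lsn }

fn decide(
    pitr_configured: bool,
    time_cutoff: Option<Lsn>, // result of the timestamp -> LSN lookup
    last_record_lsn: Lsn,
    latest_gc_cutoff: Lsn,
    space_cutoff: Lsn,
) -> Cutoffs {
    match (pitr_configured, time_cutoff) {
        // No PITR: time places no limit; space keeps whichever of the size-based
        // and default-time-based limits requires less data.
        (false, Some(t)) => Cutoffs { time: last_record_lsn, space: std::cmp::max(t, space_cutoff) },
        // No PITR and the lookup failed: fall back to the plain size-based cutoff.
        (false, None) => Cutoffs { time: last_record_lsn, space: space_cutoff },
        // PITR configured but the lookup failed: don't let GC advance past what
        // was already collected.
        (true, None) => Cutoffs { time: latest_gc_cutoff, space: space_cutoff },
        // PITR configured and the lookup succeeded: the time cutoff is authoritative.
        (true, Some(t)) => Cutoffs { time: t, space: t },
    }
}

fn main() {
    let c = decide(true, Some(Lsn(0x40)), Lsn(0x100), Lsn(0x10), Lsn(0x80));
    assert_eq!((c.time, c.space), (Lsn(0x40), Lsn(0x40)));
}
```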
@@ -5051,11 +5068,11 @@ impl Timeline {
return Err(GcError::TimelineCancelled);
}
let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let gc_info = self.gc_info.read().unwrap();
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
let pitr_cutoff = gc_info.cutoffs.pitr;
let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
let time_cutoff = gc_info.cutoffs.time;
let retain_lsns = gc_info.retain_lsns.clone();
// Gets the maximum LSN that holds the valid lease.
@@ -5065,14 +5082,14 @@ impl Timeline {
let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
(
horizon_cutoff,
pitr_cutoff,
space_cutoff,
time_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
)
};
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
let standby_horizon = self.standby_horizon.load();
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
@@ -5101,8 +5118,8 @@ impl Timeline {
let res = self
.gc_timeline(
horizon_cutoff,
pitr_cutoff,
space_cutoff,
time_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
new_gc_cutoff,
@@ -5120,8 +5137,8 @@ impl Timeline {
async fn gc_timeline(
&self,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
space_cutoff: Lsn,
time_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
max_lsn_with_valid_lease: Option<Lsn>,
new_gc_cutoff: Lsn,
@@ -5182,22 +5199,22 @@ impl Timeline {
result.layers_total += 1;
// 1. Is it newer than GC horizon cutoff point?
if l.get_lsn_range().end > horizon_cutoff {
if l.get_lsn_range().end > space_cutoff {
debug!(
"keeping {} because it's newer than horizon_cutoff {}",
"keeping {} because it's newer than space_cutoff {}",
l.layer_name(),
horizon_cutoff,
space_cutoff,
);
result.layers_needed_by_cutoff += 1;
continue 'outer;
}
// 2. Is it newer than the PiTR cutoff point?
if l.get_lsn_range().end > pitr_cutoff {
if l.get_lsn_range().end > time_cutoff {
debug!(
"keeping {} because it's newer than pitr_cutoff {}",
"keeping {} because it's newer than time_cutoff {}",
l.layer_name(),
pitr_cutoff,
time_cutoff,
);
result.layers_needed_by_pitr += 1;
continue 'outer;
@@ -6029,8 +6046,9 @@ mod tests {
#[tokio::test]
async fn two_layer_eviction_attempts_at_the_same_time() {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant

View File

@@ -26,9 +26,11 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -195,7 +197,7 @@ impl Timeline {
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.pitr
self.gc_info.read().unwrap().cutoffs.time
);
let layers = self.layers.read().await;
@@ -379,7 +381,7 @@ impl Timeline {
};
let begin = tokio::time::Instant::now();
let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
let phase1_layers_locked = self.layers.read().await;
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -399,9 +401,9 @@ impl Timeline {
}
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
async fn compact_level0_phase1(
self: &Arc<Self>,
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
async fn compact_level0_phase1<'a>(
self: &'a Arc<Self>,
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
ctx: &RequestContext,
@@ -415,6 +417,7 @@ impl Timeline {
.map(|x| guard.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
let threshold = self.get_compaction_threshold();
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
@@ -445,6 +448,22 @@ impl Timeline {
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
// Accumulate the size of layers in `deltas_to_compact`
let mut deltas_to_compact_bytes = 0;
// Under normal circumstances, we will accumulate up to compaction_interval L0s of size
// checkpoint_distance each. To avoid edge cases using extra system resources, bound our
// work in this function to only operate on this much delta data at once.
//
// Take the max of the configured value & the default, so that tests that configure tiny values
// can still use a sensible amount of memory, but if a deployed system configures bigger values we
// still let them compact a full stack of L0s in one go.
let delta_size_limit = std::cmp::max(
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
) as u64
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
for l in level0_deltas_iter {
let lsn_range = &l.layer_desc().lsn_range;
@@ -453,7 +472,20 @@ impl Timeline {
break;
}
deltas_to_compact.push(l.download_and_keep_resident().await?);
deltas_to_compact_bytes += l.metadata().file_size;
prev_lsn_end = lsn_range.end;
if deltas_to_compact_bytes >= delta_size_limit {
info!(
l0_deltas_selected = deltas_to_compact.len(),
l0_deltas_total = level0_deltas.len(),
"L0 compaction picker hit max delta layer size limit: {}",
delta_size_limit
);
// Proceed with compaction, but only a subset of L0s
break;
}
}
let lsn_range = Range {
start: deltas_to_compact
@@ -990,7 +1022,7 @@ impl Timeline {
"enhanced legacy compaction currently does not support retain_lsns (branches)"
)));
}
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
let gc_cutoff = gc_info.cutoffs.select_min();
let mut selected_layers = Vec::new();
// TODO: consider retain_lsns
drop(gc_info);
@@ -1008,10 +1040,12 @@ impl Timeline {
);
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
// Also, collect the layer information to decide when to split the new delta layers.
let mut all_key_values = Vec::new();
let mut downloaded_layers = Vec::new();
let mut delta_split_points = BTreeSet::new();
for layer in &layer_selection {
all_key_values.extend(layer.load_key_values(ctx).await?);
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
let desc = layer.layer_desc();
if desc.is_delta() {
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
@@ -1021,44 +1055,28 @@ impl Timeline {
delta_split_points.insert(key_range.end);
}
}
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
// image layers, make image appear before than delta.
struct ValueWrapper<'a>(&'a crate::repository::Value);
impl Ord for ValueWrapper<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use crate::repository::Value;
use std::cmp::Ordering;
match (self.0, other.0) {
(Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
(Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
_ => Ordering::Equal,
}
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
delta_layers.push(layer);
} else {
let layer = resident_layer.get_as_image(ctx).await?;
image_layers.push(layer);
}
}
impl PartialOrd for ValueWrapper<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ValueWrapper<'_> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
}
}
impl Eq for ValueWrapper<'_> {}
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
});
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
// Data of the same key.
let mut accumulated_values = Vec::new();
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
let mut last_key: Option<Key> = None;
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
async fn flush_accumulated_states(
tline: &Arc<Timeline>,
key: Key,
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
accumulated_values: &[(Key, Lsn, crate::repository::Value)],
horizon: Lsn,
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
let mut base_image = None;
@@ -1159,7 +1177,7 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
&(Key::MIN..Key::MAX), // covers the full key range
gc_cutoff,
ctx,
)
@@ -1169,20 +1187,24 @@ impl Timeline {
let delta_split_points = delta_split_points.into_iter().collect_vec();
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
for item @ (key, _, _) in &all_key_values {
if &last_key == key {
accumulated_values.push(item);
while let Some((key, lsn, val)) = merge_iter.next().await? {
if last_key.is_none() || last_key.as_ref() == Some(&key) {
if last_key.is_none() {
last_key = Some(key);
}
accumulated_values.push((key, lsn, val));
} else {
let last_key = last_key.as_mut().unwrap();
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
flush_accumulated_states(self, *last_key, &accumulated_values, gc_cutoff)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
image_layer_writer.put_image(last_key, image, ctx).await?;
image_layer_writer.put_image(*last_key, image, ctx).await?;
delta_values.extend(deltas);
delta_layers.extend(
flush_deltas(
&mut delta_values,
last_key,
*last_key,
&delta_split_points,
&mut current_delta_split_point,
self,
@@ -1192,11 +1214,12 @@ impl Timeline {
.await?,
);
accumulated_values.clear();
accumulated_values.push(item);
last_key = *key;
*last_key = key;
accumulated_values.push((key, lsn, val));
}
}
let last_key = last_key.expect("no keys produced during compaction");
// TODO: move this part to the loop body
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
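The new `delta_size_limit` bounds how much L0 delta data a single compaction pass pulls in: the larger of the configured and default compaction thresholds, multiplied by the larger of the configured and default checkpoint distances. A worked sketch with made-up config values (the real defaults live in `tenant::config::defaults` and are not restated here):

```rust
/// Sketch of the L0 selection bound added above. The config values below are
/// illustrative placeholders, not Neon's actual defaults.
fn delta_size_limit(
    compaction_threshold: usize,
    default_compaction_threshold: usize,
    checkpoint_distance: u64,
    default_checkpoint_distance: u64,
) -> u64 {
    std::cmp::max(compaction_threshold, default_compaction_threshold) as u64
        * std::cmp::max(checkpoint_distance, default_checkpoint_distance)
}

fn main() {
    // Hypothetical numbers: threshold of 10 layers, checkpoint distance of 256 MiB.
    let limit = delta_size_limit(10, 10, 256 << 20, 256 << 20);
    assert_eq!(limit, 10 * (256 << 20)); // roughly 2.5 GiB of L0 deltas per pass

    // A test that configures tiny values still gets the default-sized bound,
    // while a bigger production setting raises the bound accordingly.
    assert_eq!(delta_size_limit(2, 10, 1 << 20, 256 << 20), 10 * (256 << 20));
    assert_eq!(delta_size_limit(20, 10, 512 << 20, 256 << 20), 20 * (512 << 20));
}
```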

View File

@@ -10,6 +10,7 @@ use crate::{
},
virtual_file::{MaybeFatalIo, VirtualFile},
};
use pageserver_api::models::detach_ancestor::AncestorDetached;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
@@ -39,6 +40,9 @@ pub(crate) enum Error {
#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
#[error("failpoint: {}", .0)]
Failpoint(&'static str),
}
impl From<Error> for ApiError {
@@ -57,11 +61,41 @@ impl From<Error> for ApiError {
| e @ Error::CopyDeltaPrefix(_)
| e @ Error::UploadRewritten(_)
| e @ Error::CopyFailed(_)
| e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
| e @ Error::Unexpected(_)
| e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()),
}
}
}
impl From<crate::tenant::upload_queue::NotInitialized> for Error {
fn from(_: crate::tenant::upload_queue::NotInitialized) -> Self {
// treat all as shutting down signals, even though that is not entirely correct
// (uninitialized state)
Error::ShuttingDown
}
}
impl From<FlushLayerError> for Error {
fn from(value: FlushLayerError) -> Self {
match value {
FlushLayerError::Cancelled => Error::ShuttingDown,
FlushLayerError::NotRunning(_) => {
// FIXME(#6424): technically statically unreachable right now, given how we never
// drop the sender
Error::ShuttingDown
}
FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => {
Error::FlushAncestor(value)
}
}
}
}
pub(crate) enum Progress {
Prepared(completion::Completion, PreparedTimelineDetach),
Done(AncestorDetached),
}
pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}
@@ -88,7 +122,7 @@ pub(super) async fn prepare(
tenant: &Tenant,
options: Options,
ctx: &RequestContext,
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
) -> Result<Progress, Error> {
use Error::*;
let Some((ancestor, ancestor_lsn)) = detached
@@ -96,15 +130,67 @@ pub(super) async fn prepare(
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
// TODO: check if we have already been detached; for this we need to read the stored data
// on remote client, for that we need a follow-up which makes uploads cheaper and maintains
// a projection of the commited data.
{
let accessor = detached.remote_client.initialized_upload_queue()?;
// we are safe to inspect the latest uploaded, because we can only witness this after
// restart is complete and ancestor is no more.
let latest = accessor.latest_uploaded_index_part();
if !latest.lineage.is_detached_from_original_ancestor() {
return Err(NoAncestor);
}
}
// detached has previously been detached; let's inspect each of the current timelines and
// report back the timelines which have been reparented by our detach
let mut all_direct_children = tenant
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached)))
.map(|tl| (tl.ancestor_lsn, tl.clone()))
.collect::<Vec<_>>();
let mut any_shutdown = false;
all_direct_children.retain(
|(_, tl)| match tl.remote_client.initialized_upload_queue() {
Ok(accessor) => accessor
.latest_uploaded_index_part()
.lineage
.is_reparented(),
Err(_shutdownalike) => {
// not 100% a shutdown, but let's bail early rather than give inconsistent results in
// a sharded environment.
any_shutdown = true;
true
}
},
);
if any_shutdown {
// it could be one or many being deleted; have client retry
return Err(Error::ShuttingDown);
}
let mut reparented = all_direct_children;
// why this instead of hashset? there is a reason, but I've forgotten it many times.
//
// the error is wrong per openapi
return Err(NoAncestor);
// maybe if this was a hashset we would not be able to distinguish some race condition.
reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
return Ok(Progress::Done(AncestorDetached {
reparented_timelines: reparented
.into_iter()
.map(|(_, tl)| tl.timeline_id)
.collect(),
}));
};
if !ancestor_lsn.is_valid() {
// rare case, probably wouldn't even load
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
return Err(NoAncestor);
}
@@ -131,6 +217,15 @@ pub(super) async fn prepare(
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable");
fail::fail_point!(
"timeline-detach-ancestor::before_starting_after_locking",
|_| Err(Error::Failpoint(
"timeline-detach-ancestor::before_starting_after_locking"
))
);
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
let span =
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
@@ -151,7 +246,7 @@ pub(super) async fn prepare(
}
};
res.map_err(FlushAncestor)?;
res?;
// we do not need to wait for uploads to complete but we do need `struct Layer`,
// copying delta prefix is unsupported currently for `InMemoryLayer`.
@@ -159,7 +254,7 @@ pub(super) async fn prepare(
elapsed_ms = started_at.elapsed().as_millis(),
"froze and flushed the ancestor"
);
Ok(())
Ok::<_, Error>(())
}
.instrument(span)
.await?;
@@ -283,7 +378,7 @@ pub(super) async fn prepare(
let prepared = PreparedTimelineDetach { layers: new_layers };
Ok((guard, prepared))
Ok(Progress::Prepared(guard, prepared))
}
fn partition_work(
@@ -350,7 +445,11 @@ async fn copy_lsn_prefix(
target_timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown};
if target_timeline.cancel.is_cancelled() {
return Err(ShuttingDown);
}
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
@@ -529,7 +628,7 @@ pub(super) async fn complete(
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push(timeline.timeline_id);
reparented.push((timeline.ancestor_lsn, timeline.timeline_id));
}
Ok(None) => {
// lets just ignore this for now. one or all reparented timelines could had
@@ -551,5 +650,12 @@ pub(super) async fn complete(
tracing::info!("failed to reparent some candidates");
}
reparented.sort_unstable();
let reparented = reparented
.into_iter()
.map(|(_, timeline_id)| timeline_id)
.collect();
Ok(reparented)
}
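`prepare` now returns a `Progress` enum instead of a bare `(Completion, PreparedTimelineDetach)` tuple: `Prepared` when there is still an ancestor to detach from, and `Done` when the stored index part already shows the detach happened, in which case the reparented children are reported back. A minimal sketch of how a caller might branch on it (simplified types, not the pageserver's handler code):

```rust
/// Simplified stand-ins for the types used above.
struct Completion;
struct PreparedTimelineDetach;
struct AncestorDetached {
    reparented_timelines: Vec<u64>, // timeline ids, simplified
}

enum Progress {
    Prepared(Completion, PreparedTimelineDetach),
    Done(AncestorDetached),
}

fn handle(progress: Progress) {
    match progress {
        Progress::Prepared(_guard, _prepared) => {
            // First time through: go on to copy layer prefixes, upload, and
            // finally reparent the other children of the old ancestor.
            println!("detach prepared, continuing with the full operation");
        }
        Progress::Done(detached) => {
            // Retry after a restart: the detach already completed, so just
            // report which timelines ended up reparented.
            println!("already detached; reparented: {:?}", detached.reparented_timelines);
        }
    }
}

fn main() {
    handle(Progress::Done(AncestorDetached { reparented_timelines: vec![1, 2] }));
}
```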

View File

@@ -1118,7 +1118,7 @@ mod tests {
#[tokio::test]
async fn no_connection_no_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("no_connection_no_candidate")?;
let harness = TenantHarness::create("no_connection_no_candidate").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1151,7 +1151,7 @@ mod tests {
#[tokio::test]
async fn connection_no_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("connection_no_candidate")?;
let harness = TenantHarness::create("connection_no_candidate").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1216,7 +1216,7 @@ mod tests {
#[tokio::test]
async fn no_connection_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("no_connection_candidate")?;
let harness = TenantHarness::create("no_connection_candidate").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1279,7 +1279,7 @@ mod tests {
#[tokio::test]
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
let harness = TenantHarness::create("candidate_with_many_connection_failures")?;
let harness = TenantHarness::create("candidate_with_many_connection_failures").await?;
let mut state = dummy_state(&harness).await;
let now = Utc::now().naive_utc();
@@ -1319,7 +1319,7 @@ mod tests {
#[tokio::test]
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate").await?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1385,7 +1385,8 @@ mod tests {
#[tokio::test]
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
let harness =
TenantHarness::create("timeout_connection_threshold_current_candidate").await?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1448,7 +1449,7 @@ mod tests {
#[tokio::test]
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate").await?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let new_lsn = Lsn(100_100).align();
@@ -1550,7 +1551,7 @@ mod tests {
// and pageserver should prefer to connect to it.
let test_az = Some("test_az".to_owned());
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
let harness = TenantHarness::create("switch_to_same_availability_zone").await?;
let mut state = dummy_state(&harness).await;
state.conf.availability_zone.clone_from(&test_az);
let current_lsn = Lsn(100_000).align();

View File

@@ -228,18 +228,20 @@ impl UploadQueue {
Ok(self.initialized_mut().expect("we just set it"))
}
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
pub(crate) fn initialized_mut(
&mut self,
) -> Result<&mut UploadQueueInitialized, NotInitialized> {
use UploadQueue::*;
match self {
Uninitialized => Err(NotInitialized::Uninitialized.into()),
Uninitialized => Err(NotInitialized::Uninitialized),
Initialized(x) => {
if x.shutting_down {
Err(NotInitialized::ShuttingDown.into())
Err(NotInitialized::ShuttingDown)
} else {
Ok(x)
}
}
Stopped(_) => Err(NotInitialized::Stopped.into()),
Stopped(_) => Err(NotInitialized::Stopped),
}
}

View File

@@ -396,7 +396,6 @@ impl<'a> VectoredBlobReader<'a> {
/// Read planner used in [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. It provides a streaming API for
/// getting read blobs. It returns a batch when `handle` gets called and when the current key would just exceed the read_size and
/// max_cnt constraints.
#[cfg(test)]
pub struct StreamingVectoredReadPlanner {
read_builder: Option<VectoredReadBuilder>,
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
@@ -410,7 +409,6 @@ pub struct StreamingVectoredReadPlanner {
cnt: usize,
}
#[cfg(test)]
impl StreamingVectoredReadPlanner {
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
assert!(max_cnt > 0);

View File

@@ -1754,7 +1754,7 @@ mod tests {
#[tokio::test]
async fn test_relsize() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -1975,7 +1975,10 @@ mod tests {
// and then created it again within the same layer.
#[tokio::test]
async fn test_drop_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_drop_extend")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -2046,7 +2049,10 @@ mod tests {
// and then extended it again within the same layer.
#[tokio::test]
async fn test_truncate_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -2188,7 +2194,7 @@ mod tests {
/// split into multiple 1 GB segments in Postgres.
#[tokio::test]
async fn test_large_rel() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
let (tenant, ctx) = TenantHarness::create("test_large_rel").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -2296,7 +2302,7 @@ mod tests {
let startpoint = Lsn::from_hex("14AEC08").unwrap();
let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap();
let (tenant, ctx) = harness.load().await;
let remote_initdb_path =

54
patches/rum.patch Normal file
View File

@@ -0,0 +1,54 @@
commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb
Author: Anastasia Lubennikova <anastasia@neon.tech>
Date: Mon Jul 15 12:31:56 2024 +0100
Neon: fix unlogged index build patch
diff --git a/src/ruminsert.c b/src/ruminsert.c
index e8b209d..e89bf2a 100644
--- a/src/ruminsert.c
+++ b/src/ruminsert.c
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(index->rd_smgr);
+#endif
+
initRumState(&buildstate.rumstate, index);
buildstate.rumstate.isBuild = true;
buildstate.indtuples = 0;
@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+#endif
+
/*
* Write index to xlog
*/
@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
UnlockReleaseBuffer(buffer);
}
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+
+ smgr_end_unlogged_build(index->rd_smgr);
+ }
+#endif
+
/*
* Return statistics
*/

13
poetry.lock generated
View File

@@ -2641,19 +2641,18 @@ pbr = "*"
[[package]]
name = "setuptools"
version = "65.5.1"
version = "70.0.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
{file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
{file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
{file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"

View File

@@ -92,6 +92,7 @@ tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
typed-json.workspace = true
url.workspace = true
urlencoding.workspace = true
utils.workspace = true

View File

@@ -181,8 +181,9 @@ pub async fn worker(
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
let rx = rx.map(RequestData::from);
let storage =
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?;
let properties = WriterProperties::builder()
.set_data_page_size_limit(config.parquet_upload_page_size)
@@ -217,6 +218,7 @@ pub async fn worker(
let storage_disconnect =
GenericRemoteStorage::from_config(&disconnect_events_storage_config)
.await
.context("remote storage for disconnect events init")?;
let parquet_config_disconnect = parquet_config.clone();
tokio::try_join!(
@@ -545,7 +547,9 @@ mod tests {
},
timeout: std::time::Duration::from_secs(120),
};
let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
let storage = GenericRemoteStorage::from_config(&remote_storage_config)
.await
.unwrap();
worker_inner(storage, rx, config).await.unwrap();

View File

@@ -18,7 +18,7 @@ use hyper1::Response;
use hyper1::StatusCode;
use hyper1::{HeaderMap, Request};
use pq_proto::StartupMessageParamsBuilder;
use serde_json::json;
use serde::Serialize;
use serde_json::Value;
use tokio::time;
use tokio_postgres::error::DbError;
@@ -32,6 +32,7 @@ use tokio_postgres::Transaction;
use tokio_util::sync::CancellationToken;
use tracing::error;
use tracing::info;
use typed_json::json;
use url::Url;
use utils::http::error::ApiError;
@@ -263,13 +264,8 @@ pub async fn handle(
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
_ => None,
};
fn get<'a, T: serde::Serialize>(
db: Option<&'a DbError>,
x: impl FnOnce(&'a DbError) -> T,
) -> Value {
db.map(x)
.and_then(|t| serde_json::to_value(t).ok())
.unwrap_or_default()
fn get<'a, T: Default>(db: Option<&'a DbError>, x: impl FnOnce(&'a DbError) -> T) -> T {
db.map(x).unwrap_or_default()
}
if let Some(db_error) = db_error {
@@ -278,17 +274,11 @@ pub async fn handle(
let position = db_error.and_then(|db| db.position());
let (position, internal_position, internal_query) = match position {
Some(ErrorPosition::Original(position)) => (
Value::String(position.to_string()),
Value::Null,
Value::Null,
),
Some(ErrorPosition::Internal { position, query }) => (
Value::Null,
Value::String(position.to_string()),
Value::String(query.clone()),
),
None => (Value::Null, Value::Null, Value::Null),
Some(ErrorPosition::Original(position)) => (Some(position.to_string()), None, None),
Some(ErrorPosition::Internal { position, query }) => {
(None, Some(position.to_string()), Some(query.clone()))
}
None => (None, None, None),
};
let code = get(db_error, |db| db.code().code());
@@ -578,10 +568,8 @@ async fn handle_inner(
.status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/json");
//
// Now execute the query and return the result
//
let result = match payload {
// Now execute the query and return the result.
let json_output = match payload {
Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
Payload::Batch(statements) => {
if parsed_headers.txn_read_only {
@@ -605,11 +593,9 @@ async fn handle_inner(
let metrics = client.metrics();
// how could this possibly fail
let body = serde_json::to_string(&result).expect("json serialization should not fail");
let len = body.len();
let len = json_output.len();
let response = response
.body(Full::new(Bytes::from(body)))
.body(Full::new(Bytes::from(json_output)))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");
@@ -631,7 +617,7 @@ impl QueryData {
cancel: CancellationToken,
client: &mut Client<tokio_postgres::Client>,
parsed_headers: HttpHeaders,
) -> Result<Value, SqlOverHttpError> {
) -> Result<String, SqlOverHttpError> {
let (inner, mut discard) = client.inner();
let cancel_token = inner.cancel_token();
@@ -644,7 +630,10 @@ impl QueryData {
// The query successfully completed.
Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
discard.check_idle(status);
Ok(results)
let json_output =
serde_json::to_string(&results).expect("json serialization should not fail");
Ok(json_output)
}
// The query failed with an error
Either::Left((Err(e), __not_yet_cancelled)) => {
@@ -662,7 +651,10 @@ impl QueryData {
// query successed before it was cancelled.
Ok(Ok((status, results))) => {
discard.check_idle(status);
Ok(results)
let json_output = serde_json::to_string(&results)
.expect("json serialization should not fail");
Ok(json_output)
}
// query failed or was cancelled.
Ok(Err(error)) => {
@@ -696,7 +688,7 @@ impl BatchQueryData {
cancel: CancellationToken,
client: &mut Client<tokio_postgres::Client>,
parsed_headers: HttpHeaders,
) -> Result<Value, SqlOverHttpError> {
) -> Result<String, SqlOverHttpError> {
info!("starting transaction");
let (inner, mut discard) = client.inner();
let cancel_token = inner.cancel_token();
@@ -718,9 +710,9 @@ impl BatchQueryData {
e
})?;
let results =
let json_output =
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
Ok(results) => {
Ok(json_output) => {
info!("commit");
let status = transaction.commit().await.map_err(|e| {
// if we cannot commit - for now don't return connection to pool
@@ -729,7 +721,7 @@ impl BatchQueryData {
e
})?;
discard.check_idle(status);
results
json_output
}
Err(SqlOverHttpError::Cancelled(_)) => {
if let Err(err) = cancel_token.cancel_query(NoTls).await {
@@ -753,7 +745,7 @@ impl BatchQueryData {
}
};
Ok(json!({ "results": results }))
Ok(json_output)
}
}
@@ -762,7 +754,7 @@ async fn query_batch(
transaction: &Transaction<'_>,
queries: BatchQueryData,
parsed_headers: HttpHeaders,
) -> Result<Vec<Value>, SqlOverHttpError> {
) -> Result<String, SqlOverHttpError> {
let mut results = Vec::with_capacity(queries.queries.len());
let mut current_size = 0;
for stmt in queries.queries {
@@ -787,7 +779,11 @@ async fn query_batch(
}
}
}
Ok(results)
let results = json!({ "results": results });
let json_output = serde_json::to_string(&results).expect("json serialization should not fail");
Ok(json_output)
}
async fn query_to_json<T: GenericClient>(
@@ -795,7 +791,7 @@ async fn query_to_json<T: GenericClient>(
data: QueryData,
current_size: &mut usize,
parsed_headers: HttpHeaders,
) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> {
info!("executing query");
let query_params = data.params;
let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?);
@@ -844,8 +840,8 @@ async fn query_to_json<T: GenericClient>(
for c in row_stream.columns() {
fields.push(json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"name": c.name().to_owned(),
"dataTypeID": c.type_().oid(),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
@@ -863,15 +859,14 @@ async fn query_to_json<T: GenericClient>(
.map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
// resulting JSON format is based on the format of node-postgres result
Ok((
ready,
json!({
"command": command_tag_name,
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
}),
))
// Resulting JSON format is based on the format of node-postgres result.
let results = json!({
"command": command_tag_name.to_string(),
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
});
Ok((ready, results))
}
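
For readers skimming the hunk above, here is a standalone illustration (not part of the change) of the JSON body shape the handler serializes, mirroring node-postgres result objects; all values and the exact set of field attributes are invented for the example.

```rust
// Illustrative only: the response shape assembled by query_to_json above.
use serde_json::json;

fn main() {
    let example = json!({
        "command": "SELECT",
        "rowCount": 1,
        "rows": [{ "id": "1", "name": "alice" }],
        "fields": [
            { "name": "id",   "dataTypeID": 23, "tableID": 0, "columnID": 1, "dataTypeSize": 4 },
            { "name": "name", "dataTypeID": 25, "tableID": 0, "columnID": 2, "dataTypeSize": -1 }
        ],
        "rowAsArray": false
    });
    println!("{}", serde_json::to_string_pretty(&example).unwrap());
}
```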

View File

@@ -357,11 +357,15 @@ pub async fn task_backup(
info!("metrics backup has shut down");
}
// Even if the remote storage is not configured, we still want to clear the metrics.
let storage = backup_config
.remote_storage_config
.as_ref()
.map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
.transpose()?;
let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() {
Some(
GenericRemoteStorage::from_config(config)
.await
.context("remote storage init")?,
)
} else {
None
};
let mut ticker = tokio::time::interval(backup_config.interval);
let mut prev = Utc::now();
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();

View File

@@ -12,13 +12,15 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
Ok(())
}
(Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Safekeeper auth",
claims.scope
)
.into(),
)),
(Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => {
Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Safekeeper auth",
claims.scope
)
.into(),
))
}
(Scope::SafekeeperData, _) => Ok(()),
}
}

View File

@@ -418,7 +418,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
metrics::register_internal(Box::new(timeline_collector))?;
wal_backup::init_remote_storage(&conf);
wal_backup::init_remote_storage(&conf).await;
// Keep handles to main tasks to die if any of them disappears.
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =

View File

@@ -74,10 +74,16 @@ pub async fn handle_request(request: Request) -> Result<()> {
assert!(flush_lsn >= start_lsn);
if request.until_lsn > flush_lsn {
bail!("requested LSN is beyond the end of the timeline");
bail!(format!(
"requested LSN {} is beyond the end of the timeline {}",
request.until_lsn, flush_lsn
));
}
if request.until_lsn < start_lsn {
bail!("requested LSN is before the start of the timeline");
bail!(format!(
"requested LSN {} is before the start of the timeline {}",
request.until_lsn, start_lsn
));
}
if request.until_lsn > commit_lsn {

View File

@@ -173,15 +173,6 @@ pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.expect("Failed to create broker runtime")
});
pub static WAL_REMOVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL remover")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL backup worker")
@@ -189,12 +180,3 @@ pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.build()
.expect("Failed to create WAL backup runtime")
});
pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("metric shifter")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});

View File

@@ -199,10 +199,7 @@ async fn redownload_partial_segment(
file.flush().await?;
let final_path = local_segment_path(mgr, partial);
info!(
"downloaded {} bytes, renaming to {}",
final_path, final_path,
);
info!("downloaded {actual_len} bytes, renaming to {final_path}");
if let Err(e) = durable_rename(&tmp_file, &final_path, !mgr.conf.no_sync).await {
// Probably rename succeeded, but fsync of it failed. Remove
// the file then to avoid using it.

View File

@@ -4,7 +4,7 @@
use std::collections::HashSet;
use tracing::{debug, warn};
use tracing::debug;
use crate::timeline_manager::ManagerCtlMessage;
@@ -23,7 +23,7 @@ impl Drop for ResidenceGuard {
.manager_tx
.send(ManagerCtlMessage::GuardDrop(self.guard_id));
if let Err(e) = res {
warn!("failed to send GuardDrop message: {:?}", e);
debug!("failed to send GuardDrop message: {:?}", e);
}
}
}

View File

@@ -22,7 +22,7 @@ use tokio::fs::File;
use tokio::select;
use tokio::sync::mpsc::{self, Receiver, Sender};
use tokio::sync::watch;
use tokio::sync::{watch, OnceCell};
use tokio::time::sleep;
use tracing::*;
@@ -33,8 +33,6 @@ use crate::timeline::{PeerInfo, WalResidentTimeline};
use crate::timeline_manager::{Manager, StateSnapshot};
use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
use once_cell::sync::OnceCell;
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
@@ -167,7 +165,7 @@ fn determine_offloader(
}
}
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new();
// Storage must be configured and initialized when this is called.
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
@@ -178,14 +176,22 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
.unwrap()
}
pub fn init_remote_storage(conf: &SafeKeeperConf) {
pub async fn init_remote_storage(conf: &SafeKeeperConf) {
// TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
// dependencies to all tasks instead.
REMOTE_STORAGE.get_or_init(|| {
conf.remote_storage
.as_ref()
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
});
REMOTE_STORAGE
.get_or_init(|| async {
if let Some(conf) = conf.remote_storage.as_ref() {
Some(
GenericRemoteStorage::from_config(conf)
.await
.expect("failed to create remote storage"),
)
} else {
None
}
})
.await;
}
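
Context for the change above: `GenericRemoteStorage::from_config` is now async, so the sync `once_cell::sync::OnceCell` is swapped for `tokio::sync::OnceCell`. A minimal, self-contained sketch of that async initialization pattern (independent of the safekeeper code; names invented):

```rust
use tokio::sync::OnceCell;

static VALUE: OnceCell<u32> = OnceCell::const_new();

// Stands in for an async constructor such as GenericRemoteStorage::from_config(..).await
async fn expensive_init() -> u32 {
    42
}

#[tokio::main]
async fn main() {
    // The future passed to get_or_init runs at most once, even with concurrent callers.
    let v = VALUE.get_or_init(|| async { expensive_init().await }).await;
    assert_eq!(*v, 42);

    // Subsequent calls return the cached value; the initializer is not invoked again.
    let again = VALUE.get_or_init(|| async { expensive_init().await }).await;
    assert_eq!(*again, 42);
}
```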
struct WalBackupTask {

View File

@@ -289,6 +289,18 @@ impl PartialBackup {
})
.collect();
if new_segments.len() == 1 {
// we have an uploaded segment; it must not be deleted from remote storage
segments_to_delete.retain(|name| name != &new_segments[0].name);
} else {
// there should always be zero or one uploaded segment
assert!(
new_segments.is_empty(),
"too many uploaded segments: {:?}",
new_segments
);
}
info!("deleting objects: {:?}", segments_to_delete);
let mut objects_to_delete = vec![];
for seg in segments_to_delete.iter() {

View File

@@ -0,0 +1,23 @@
[package]
name = "storage_controller_client"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
pageserver_api.workspace = true
pageserver_client.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio.workspace = true
futures.workspace = true
tokio-util.workspace = true
anyhow.workspace = true
postgres.workspace = true
bytes.workspace = true

View File

@@ -0,0 +1,62 @@
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::{Method, Url};
use serde::{de::DeserializeOwned, Serialize};
use std::str::FromStr;
pub struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
pub fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
pub async fn dispatch<RQ, RS>(
&self,
method: Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
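
A hypothetical usage sketch of the new client (not part of the diff): fetching a tenant description from the storage controller, as the scrubber does later in this PR. The base URL, token, and error conversion here are assumptions for illustration only.

```rust
use pageserver_api::controller_api::TenantDescribeResponse;
use reqwest::{Method, Url};
use storage_controller_client::control_api::Client;
use utils::id::TenantId;

async fn describe_tenant(tenant_id: TenantId) -> anyhow::Result<TenantDescribeResponse> {
    // Base URL as handed out for pageserver upcalls; dispatch() keeps only the
    // host and port when building the request URL.
    let client = Client::new(
        Url::parse("http://127.0.0.1:1234/upcall/v1/")?,
        Some("example-jwt-with-scrubber-scope".to_string()), // placeholder token
    );
    let desc = client
        .dispatch::<(), TenantDescribeResponse>(
            Method::GET,
            format!("control/v1/tenant/{tenant_id}"),
            None,
        )
        .await?;
    Ok(desc)
}
```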

View File

@@ -0,0 +1 @@
pub mod control_api;

View File

@@ -330,6 +330,22 @@ async fn handle_tenant_timeline_delete(
.await
}
async fn handle_tenant_timeline_detach_ancestor(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
let res = service
.tenant_timeline_detach_ancestor(tenant_id, timeline_id)
.await?;
json_response(StatusCode::OK, res)
}
async fn handle_tenant_timeline_passthrough(
service: Arc<Service>,
req: Request<Body>,
@@ -414,7 +430,7 @@ async fn handle_tenant_describe(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
check_permissions(&req, Scope::Scrubber)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
@@ -1006,6 +1022,16 @@ pub fn make_router(
RequestName("v1_tenant_timeline"),
)
})
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor",
|r| {
tenant_service_handler(
r,
handle_tenant_timeline_detach_ancestor,
RequestName("v1_tenant_timeline_detach_ancestor"),
)
},
)
// Tenant detail GET passthrough to shard zero:
.get("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(

View File

@@ -1,11 +1,11 @@
use anyhow::{anyhow, Context};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
@@ -51,10 +51,6 @@ struct Cli {
#[arg(long)]
compute_hook_url: Option<String>,
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
#[arg(long)]
database_url: Option<String>,
@@ -206,11 +202,10 @@ async fn async_main() -> anyhow::Result<()> {
let args = Cli::parse();
tracing::info!(
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
"version: {}, launch_timestamp: {}, build_tag {}, listening on {}",
GIT_VERSION,
launch_ts.to_string(),
BUILD_TAG,
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
args.listen
);
@@ -277,8 +272,7 @@ async fn async_main() -> anyhow::Result<()> {
.await
.context("Running database migrations")?;
let json_path = args.path;
let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));
let persistence = Arc::new(Persistence::new(secrets.database_url));
let service = Service::spawn(config, persistence.clone()).await?;
@@ -316,22 +310,23 @@ async fn async_main() -> anyhow::Result<()> {
}
tracing::info!("Terminating on signal");
if json_path.is_some() {
// Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
// full postgres dumps around.
if let Err(e) = persistence.write_tenants_json().await {
tracing::error!("Failed to write JSON on shutdown: {e}")
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service.
server_shutdown.cancel();
match tokio::time::timeout(Duration::from_secs(5), server_task).await {
Ok(Ok(_)) => {
tracing::info!("Joined HTTP server task");
}
Ok(Err(e)) => {
tracing::error!("Error joining HTTP server task: {e}")
}
Err(_) => {
tracing::warn!("Timed out joining HTTP server task");
// We will fall through and shut down the service anyway, any request handlers
// in flight will experience cancellation & their clients will see a torn connection.
}
}
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service
server_shutdown.cancel();
if let Err(e) = server_task.await {
tracing::error!("Error joining HTTP server task: {e}")
}
tracing::info!("Joined HTTP server task");
service.shutdown().await;
tracing::info!("Service shutdown complete");

View File

@@ -1,8 +1,9 @@
use pageserver_api::{
models::{
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse,
PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse,
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
TopTenantShardsRequest, TopTenantShardsResponse,
},
shard::TenantShardId,
};
@@ -226,6 +227,21 @@ impl PageserverClient {
)
}
pub(crate) async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<AncestorDetached> {
measured_request!(
"timeline_detach_ancestor",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner
.timeline_detach_ancestor(tenant_shard_id, timeline_id)
.await
)
}
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
measured_request!(
"utilization",

View File

@@ -5,8 +5,6 @@ use std::time::Duration;
use std::time::Instant;
use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
@@ -55,11 +53,6 @@ use crate::node::Node;
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
pub struct Persistence {
connection_pool: diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<PgConnection>>,
// In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of
// test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
// compatible just yet.
json_path: Option<Utf8PathBuf>,
}
/// Legacy format, for use in JSON compat objects in test environment
@@ -124,7 +117,7 @@ impl Persistence {
const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10);
const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60);
pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
pub fn new(database_url: String) -> Self {
let manager = diesel::r2d2::ConnectionManager::<PgConnection>::new(database_url);
// We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time
@@ -139,10 +132,7 @@ impl Persistence {
.build(manager)
.expect("Could not build connection pool");
Self {
connection_pool,
json_path,
}
Self { connection_pool }
}
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
@@ -302,85 +292,13 @@ impl Persistence {
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_measured_conn(
DatabaseOperation::ListTenantShards,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
},
)
.await?;
if loaded.is_empty() {
if let Some(path) = &self.json_path {
if tokio::fs::try_exists(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
{
tracing::info!("Importing from legacy JSON format at {path}");
return self.list_tenant_shards_json(path).await;
}
}
}
Ok(loaded)
}
/// Shim for automated compatibility tests: load tenants from a JSON file instead of database
pub(crate) async fn list_tenant_shards_json(
&self,
path: &Utf8Path,
) -> DatabaseResult<Vec<TenantShardPersistence>> {
let bytes = tokio::fs::read(path)
.await
.map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for shard in decoded.tenants.values_mut() {
if shard.placement_policy == "\"Single\"" {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string();
}
if shard.scheduling_policy.is_empty() {
shard.scheduling_policy =
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
}
}
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
// Synchronize database with what is in the JSON file
self.insert_tenant_shards(tenants.clone()).await?;
Ok(tenants)
}
/// For use in testing environments, where we dump out JSON on shutdown.
pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
let Some(path) = &self.json_path else {
anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
};
tracing::info!("Writing state to {path}...");
let tenants = self.list_tenant_shards().await?;
let mut tenants_map = HashMap::new();
for tsp in tenants {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
tenants_map.insert(tenant_shard_id, tsp);
}
let json = serde_json::to_string(&JsonPersistence {
tenants: tenants_map,
})?;
tokio::fs::write(path, &json).await?;
tracing::info!("Wrote {} bytes to {path}...", json.len());
Ok(())
self.with_measured_conn(
DatabaseOperation::ListTenantShards,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
},
)
.await
}
/// Tenants must be persisted before we schedule them for the first time. This enables us

View File

@@ -117,6 +117,7 @@ enum TenantOperations {
TimelineCreate,
TimelineDelete,
AttachHook,
TimelineDetachAncestor,
}
#[derive(Clone, strum_macros::Display)]
@@ -2376,18 +2377,18 @@ impl Service {
tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
client
.tenant_time_travel_remote_storage(
tenant_shard_id,
&timestamp,
&done_if_after,
)
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
node
))
})?;
.tenant_time_travel_remote_storage(
tenant_shard_id,
&timestamp,
&done_if_after,
)
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
node
))
})?;
}
}
Ok(())
@@ -2757,7 +2758,7 @@ impl Service {
// Create timeline on remaining shards with number >0
if !targets.is_empty() {
// If we had multiple shards, issue requests for the remainder now.
let jwt = self.config.jwt_token.clone();
let jwt = &self.config.jwt_token;
self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
let create_req = create_req.clone();
Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
@@ -2768,6 +2769,114 @@ impl Service {
Ok(timeline_info)
}
pub(crate) async fn tenant_timeline_detach_ancestor(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<models::detach_ancestor::AncestorDetached, ApiError> {
tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",);
let _tenant_lock = trace_shared_lock(
&self.tenant_op_locks,
tenant_id,
TenantOperations::TimelineDetachAncestor,
)
.await;
self.ensure_attached_wait(tenant_id).await?;
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.get_attached().ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
if targets.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
));
}
async fn detach_one(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
node: Node,
jwt: Option<String>,
) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> {
tracing::info!(
"Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}",
);
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
client
.timeline_detach_ancestor(tenant_shard_id, timeline_id)
.await
.map_err(|e| {
use mgmt_api::Error;
match e {
// no ancestor (ever)
Error::ApiError(StatusCode::CONFLICT, msg) => ApiError::Conflict(format!(
"{node}: {}",
msg.strip_prefix("Conflict: ").unwrap_or(&msg)
)),
// too many ancestors
Error::ApiError(StatusCode::BAD_REQUEST, msg) => {
ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}"))
}
// rest can be mapped
other => passthrough_api_error(&node, other),
}
})
.map(|res| (tenant_shard_id.shard_number, res))
}
// no shard needs to go first/last; the operation should be idempotent
// TODO: it would be great to ensure that all shards return the same error
let mut results = self
.tenant_for_shards(targets, |tenant_shard_id, node| {
futures::FutureExt::boxed(detach_one(
tenant_shard_id,
timeline_id,
node,
self.config.jwt_token.clone(),
))
})
.await?;
let any = results.pop().expect("we must have at least one response");
let mismatching = results
.iter()
.filter(|(_, res)| res != &any.1)
.collect::<Vec<_>>();
if !mismatching.is_empty() {
let matching = results.len() - mismatching.len();
tracing::error!(
matching,
compared_against=?any,
?mismatching,
"shards returned different results"
);
}
Ok(any.1)
}
/// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
///
/// On success, the returned vector contains exactly the same number of elements as the input `locations`.
@@ -2894,8 +3003,8 @@ impl Service {
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
))
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}",
))
})
}
@@ -3847,6 +3956,8 @@ impl Service {
"failpoint".to_string()
)));
failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
tracing::info!(
"Split {} into {}",
parent_id,

View File

@@ -34,6 +34,7 @@ camino.workspace = true
rustls.workspace = true
rustls-native-certs.workspace = true
once_cell.workspace = true
storage_controller_client.workspace = true
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }

View File

@@ -24,6 +24,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use clap::ValueEnum;
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use remote_storage::RemotePath;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use tokio::io::AsyncReadExt;
@@ -31,7 +32,7 @@ use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use utils::fs_ext;
use utils::id::{TenantId, TimelineId};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
const MAX_RETRIES: usize = 20;
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
@@ -54,7 +55,7 @@ pub struct S3Target {
/// in the pageserver, as all timeline objects existing in the scope of a particular
/// tenant: the scrubber is different in that it handles collections of data referring to many
/// TenantShardTimelineIds in one place.
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct TenantShardTimelineId {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
@@ -67,6 +68,10 @@ impl TenantShardTimelineId {
timeline_id,
}
}
fn as_tenant_timeline_id(&self) -> TenantTimelineId {
TenantTimelineId::new(self.tenant_shard_id.tenant_id, self.timeline_id)
}
}
impl Display for TenantShardTimelineId {
@@ -179,6 +184,22 @@ impl RootTarget {
.with_sub_segment(&id.timeline_id.to_string())
}
/// Given RemotePath "tenants/foo/timelines/bar/layerxyz", prefix it to a literal
/// key in the S3 bucket.
pub fn absolute_key(&self, key: &RemotePath) -> String {
let root = match self {
Self::Pageserver(root) => root,
Self::Safekeeper(root) => root,
};
let prefix = &root.prefix_in_bucket;
if prefix.ends_with('/') {
format!("{prefix}{key}")
} else {
format!("{prefix}/{key}")
}
}
pub fn bucket_name(&self) -> &str {
match self {
Self::Pageserver(root) => &root.bucket_name,
@@ -216,6 +237,14 @@ impl BucketConfig {
}
}
pub struct ControllerClientConfig {
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
pub controller_api: Url,
/// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
pub controller_jwt: String,
}
pub struct ConsoleConfig {
pub token: String,
pub base_url: Url,

View File

@@ -1,11 +1,12 @@
use anyhow::bail;
use anyhow::{anyhow, bail};
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use storage_scrubber::find_large_objects;
use reqwest::Url;
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
use storage_scrubber::{find_large_objects, ControllerClientConfig};
use storage_scrubber::{
init_logging, pageserver_physical_gc::pageserver_physical_gc,
scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
@@ -24,6 +25,14 @@ struct Cli {
#[arg(short, long, default_value_t = false)]
delete: bool,
#[arg(long)]
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
controller_api: Option<Url>,
#[arg(long)]
/// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
controller_jwt: Option<String>,
}
#[derive(Subcommand, Debug)]
@@ -204,8 +213,37 @@ async fn main() -> anyhow::Result<()> {
min_age,
mode,
} => {
let summary =
pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?;
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match (&controller_client_conf, mode) {
(Some(_), _) => {
// Any mode may run when controller API is set
}
(None, GcMode::Full) => {
// The part of physical GC where we erase ancestor layers cannot be done safely without
// confirming the most recent complete shard split with the controller. Refuse to run, rather
// than doing it unsafely.
return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
}
(None, GcMode::DryRun | GcMode::IndicesOnly) => {
// These GcModes do not require the controller to run.
}
}
let summary = pageserver_physical_gc(
bucket_config,
controller_client_conf,
tenant_ids,
min_age.into(),
mode,
)
.await?;
println!("{}", serde_json::to_string(&summary).unwrap());
Ok(())
}

View File

@@ -1,22 +1,50 @@
use std::time::{Duration, UNIX_EPOCH};
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use crate::checks::{list_timeline_blobs, BlobDataParseResult};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use crate::{
init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
};
use aws_sdk_s3::Client;
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::IndexPart;
use pageserver_api::shard::TenantShardId;
use pageserver_api::controller_api::TenantDescribeResponse;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use remote_storage::RemotePath;
use reqwest::Method;
use serde::Serialize;
use storage_controller_client::control_api;
use tracing::{info_span, Instrument};
use utils::generation::Generation;
use utils::id::{TenantId, TenantTimelineId};
#[derive(Serialize, Default)]
pub struct GcSummary {
indices_deleted: usize,
remote_storage_errors: usize,
controller_api_errors: usize,
ancestor_layers_deleted: usize,
}
impl GcSummary {
fn merge(&mut self, other: Self) {
let Self {
indices_deleted,
remote_storage_errors,
ancestor_layers_deleted,
controller_api_errors,
} = other;
self.indices_deleted += indices_deleted;
self.remote_storage_errors += remote_storage_errors;
self.ancestor_layers_deleted += ancestor_layers_deleted;
self.controller_api_errors += controller_api_errors;
}
}
#[derive(clap::ValueEnum, Debug, Clone, Copy)]
@@ -26,9 +54,9 @@ pub enum GcMode {
// Enable only removing old-generation indices
IndicesOnly,
// Enable all forms of GC
// TODO: this will be used when shard split ancestor layer deletion is added
// All,
Full,
}
impl std::fmt::Display for GcMode {
@@ -36,10 +64,232 @@ impl std::fmt::Display for GcMode {
match self {
GcMode::DryRun => write!(f, "dry-run"),
GcMode::IndicesOnly => write!(f, "indices-only"),
GcMode::Full => write!(f, "full"),
}
}
}
mod refs {
use super::*;
// Map of cross-shard layer references, giving a refcount for each layer in each shard that is referenced by some other
// shard in the same tenant. This is sparse! The vast majority of timelines will have no cross-shard refs, and those that
// do have cross-shard refs should eventually drop most of them via compaction.
//
// In our inner map type, the TTID in the key is shard-agnostic, and the ShardIndex in the value refers to the _ancestor
// which is referenced_.
#[derive(Default)]
pub(super) struct AncestorRefs(
BTreeMap<TenantTimelineId, HashMap<(ShardIndex, LayerName), usize>>,
);
impl AncestorRefs {
/// Insert references for layers discovered in a particular shard-timeline that refer to an ancestral shard-timeline.
pub(super) fn update(
&mut self,
ttid: TenantShardTimelineId,
layers: Vec<(LayerName, LayerFileMetadata)>,
) {
let ttid_refs = self.0.entry(ttid.as_tenant_timeline_id()).or_default();
for (layer_name, layer_metadata) in layers {
// Increment refcount of this layer in the ancestor shard
*(ttid_refs
.entry((layer_metadata.shard, layer_name))
.or_default()) += 1;
}
}
/// For a particular TTID, return the map of all ancestor layers referenced by a descendant to their refcount
///
/// The `ShardIndex` in the result's key is the index of the _ancestor_, not the descendant.
pub(super) fn get_ttid_refcounts(
&self,
ttid: &TenantTimelineId,
) -> Option<&HashMap<(ShardIndex, LayerName), usize>> {
self.0.get(ttid)
}
}
}
use refs::AncestorRefs;
// As we see shards for a tenant, accumulate knowledge needed for cross-shard GC:
// - Are there any ancestor shards?
// - Are there any refs to ancestor shards' layers?
#[derive(Default)]
struct TenantRefAccumulator {
shards_seen: HashMap<TenantId, Vec<ShardIndex>>,
// For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to
ancestor_ref_shards: AncestorRefs,
}
impl TenantRefAccumulator {
fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) {
let this_shard_idx = ttid.tenant_shard_id.to_index();
(*self
.shards_seen
.entry(ttid.tenant_shard_id.tenant_id)
.or_default())
.push(this_shard_idx);
let mut ancestor_refs = Vec::new();
for (layer_name, layer_metadata) in &index_part.layer_metadata {
if layer_metadata.shard != this_shard_idx {
// This is a reference from this shard to a layer in an ancestor shard: we must track this
// as a marker to not GC this layer from the parent.
ancestor_refs.push((layer_name.clone(), layer_metadata.clone()));
}
}
if !ancestor_refs.is_empty() {
tracing::info!(%ttid, "Found {} ancestor refs", ancestor_refs.len());
self.ancestor_ref_shards.update(ttid, ancestor_refs);
}
}
/// Consume Self and return a vector of ancestor tenant shards that should be GC'd, and a map of referenced ancestor layers to preserve
async fn into_gc_ancestors(
self,
controller_client: &control_api::Client,
summary: &mut GcSummary,
) -> (Vec<TenantShardId>, AncestorRefs) {
let mut ancestors_to_gc = Vec::new();
for (tenant_id, mut shard_indices) in self.shards_seen {
// Find the highest shard count
let latest_count = shard_indices
.iter()
.map(|i| i.shard_count)
.max()
.expect("Always at least one shard");
let (mut latest_shards, ancestor_shards) = {
let at =
itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count);
(shard_indices[0..at].to_owned(), &shard_indices[at..])
};
// Sort shards, as we will later compare them with a sorted list from the controller
latest_shards.sort();
// Check that we have a complete view of the latest shard count: this should always be the case unless we happened
// to scan the S3 bucket halfway through a shard split.
if latest_shards.len() != latest_count.count() as usize {
// This should be extremely rare, so we warn on it.
tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count);
continue;
}
// Check if we have any non-latest-count shards
if ancestor_shards.is_empty() {
tracing::debug!(%tenant_id, "No ancestor shards to clean up");
continue;
}
// Based on S3 view, this tenant looks like it might have some ancestor shard work to do. We
// must only do this work if the tenant is not currently being split: otherwise, it is not safe
// to GC ancestors, because if the split fails then the controller will try to attach ancestor
// shards again.
match controller_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await
{
Err(e) => {
// We were not able to learn the latest shard split state from the controller, so we will not
// do ancestor GC on this tenant.
tracing::warn!(%tenant_id, "Failed to query storage controller, will not do ancestor GC: {e}");
summary.controller_api_errors += 1;
continue;
}
Ok(desc) => {
// We expect to see that the latest shard count matches the one we saw in S3, and that none
// of the shards indicate splitting in progress.
let controller_indices: Vec<ShardIndex> = desc
.shards
.iter()
.map(|s| s.tenant_shard_id.to_index())
.collect();
if controller_indices != latest_shards {
tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})");
continue;
}
if desc.shards.iter().any(|s| s.is_splitting) {
tracing::info!(%tenant_id, "One or more shards is currently splitting");
continue;
}
// This shouldn't be too noisy, because we only log this for tenants that have some ancestral refs.
tracing::info!(%tenant_id, "Validated state with controller: {desc:?}");
}
}
// GC ancestor shards
for ancestor_shard in ancestor_shards.iter().map(|idx| TenantShardId {
tenant_id,
shard_count: idx.shard_count,
shard_number: idx.shard_number,
}) {
ancestors_to_gc.push(ancestor_shard);
}
}
(ancestors_to_gc, self.ancestor_ref_shards)
}
}
async fn is_old_enough(
s3_client: &Client,
bucket_config: &BucketConfig,
min_age: &Duration,
key: &str,
summary: &mut GcSummary,
) -> bool {
// Validation: we will only GC indices & layers after a time threshold (e.g. one week) so that during an incident
// it is easier to read old data for analysis, and easier to roll back shard splits without having to un-delete any objects.
let age: Duration = match s3_client
.head_object()
.bucket(&bucket_config.bucket)
.key(key)
.send()
.await
{
Ok(response) => match response.last_modified {
None => {
tracing::warn!("Missing last_modified");
summary.remote_storage_errors += 1;
return false;
}
Some(last_modified) => match SystemTime::try_from(last_modified).map(|t| t.elapsed()) {
Ok(Ok(e)) => e,
Err(_) | Ok(Err(_)) => {
tracing::warn!("Bad last_modified time: {last_modified:?}");
return false;
}
},
},
Err(e) => {
tracing::warn!("Failed to HEAD {key}: {e}");
summary.remote_storage_errors += 1;
return false;
}
};
let old_enough = &age > min_age;
if !old_enough {
tracing::info!(
"Skipping young object {} < {}",
humantime::format_duration(age),
humantime::format_duration(*min_age)
);
}
old_enough
}
async fn maybe_delete_index(
s3_client: &Client,
bucket_config: &BucketConfig,
@@ -79,45 +329,7 @@ async fn maybe_delete_index(
return;
}
// Validation: we will only delete indices after one week, so that during incidents we will have
// easy access to recent indices.
let age: Duration = match s3_client
.head_object()
.bucket(&bucket_config.bucket)
.key(key)
.send()
.await
{
Ok(response) => match response.last_modified {
None => {
tracing::warn!("Missing last_modified");
summary.remote_storage_errors += 1;
return;
}
Some(last_modified) => {
let last_modified =
UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64());
match last_modified.elapsed() {
Ok(e) => e,
Err(_) => {
tracing::warn!("Bad last_modified time: {last_modified:?}");
return;
}
}
}
},
Err(e) => {
tracing::warn!("Failed to HEAD {key}: {e}");
summary.remote_storage_errors += 1;
return;
}
};
if &age < min_age {
tracing::info!(
"Skipping young object {} < {}",
age.as_secs_f64(),
min_age.as_secs_f64()
);
if !is_old_enough(s3_client, bucket_config, min_age, key, summary).await {
return;
}
@@ -145,6 +357,108 @@ async fn maybe_delete_index(
}
}
#[allow(clippy::too_many_arguments)]
async fn gc_ancestor(
s3_client: &Client,
bucket_config: &BucketConfig,
root_target: &RootTarget,
min_age: &Duration,
ancestor: TenantShardId,
refs: &AncestorRefs,
mode: GcMode,
summary: &mut GcSummary,
) -> anyhow::Result<()> {
// Scan timelines in the ancestor
let timelines = stream_tenant_timelines(s3_client, root_target, ancestor).await?;
let mut timelines = std::pin::pin!(timelines);
// Build a list of keys to retain
while let Some(ttid) = timelines.next().await {
let ttid = ttid?;
let data = list_timeline_blobs(s3_client, ttid, root_target).await?;
let s3_layers = match data.blob_data {
BlobDataParseResult::Parsed {
index_part: _,
index_part_generation: _,
s3_layers,
} => s3_layers,
BlobDataParseResult::Relic => {
// Post-deletion tenant location: don't try and GC it.
continue;
}
BlobDataParseResult::Incorrect(reasons) => {
// Our primary purpose isn't to report on bad data, but we log this rather than skipping silently
tracing::warn!(
"Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
);
continue;
}
};
let ttid_refs = refs.get_ttid_refcounts(&ttid.as_tenant_timeline_id());
let ancestor_shard_index = ttid.tenant_shard_id.to_index();
for (layer_name, layer_gen) in s3_layers {
let ref_count = ttid_refs
.and_then(|m| m.get(&(ancestor_shard_index, layer_name.clone())))
.copied()
.unwrap_or(0);
if ref_count > 0 {
tracing::debug!(%ttid, "Ancestor layer {layer_name} has {ref_count} refs");
continue;
}
tracing::info!(%ttid, "Ancestor layer {layer_name} is not referenced");
// Build the key for the layer we are considering deleting
let key = root_target.absolute_key(&remote_layer_path(
&ttid.tenant_shard_id.tenant_id,
&ttid.timeline_id,
ancestor_shard_index,
&layer_name,
layer_gen,
));
// We apply a time threshold to GCing objects that are un-referenced: this preserves our ability
// to roll back a shard split if we have to, by avoiding deleting ancestor layers right away
if !is_old_enough(s3_client, bucket_config, min_age, &key, summary).await {
continue;
}
if !matches!(mode, GcMode::Full) {
tracing::info!("Dry run: would delete key {key}");
continue;
}
// All validations passed: erase the object
match s3_client
.delete_object()
.bucket(&bucket_config.bucket)
.key(&key)
.send()
.await
{
Ok(_) => {
tracing::info!("Successfully deleted unreferenced ancestor layer {key}");
summary.ancestor_layers_deleted += 1;
}
Err(e) => {
tracing::warn!("Failed to delete layer {key}: {e}");
summary.remote_storage_errors += 1;
}
}
}
// TODO: if all the layers are gone, clean up the whole timeline dir (remove index)
}
Ok(())
}
/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection
/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection
/// is about removing:
@@ -156,22 +470,26 @@ async fn maybe_delete_index(
/// make sure that object listings don't get slowed down by large numbers of garbage objects.
pub async fn pageserver_physical_gc(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantShardId>,
controller_client_conf: Option<ControllerClientConfig>,
tenant_shard_ids: Vec<TenantShardId>,
min_age: Duration,
mode: GcMode,
) -> anyhow::Result<GcSummary> {
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = if tenant_ids.is_empty() {
let tenants = if tenant_shard_ids.is_empty() {
futures::future::Either::Left(stream_tenants(&s3_client, &target))
} else {
futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok)))
};
// How many tenants to process in parallel. We need to be mindful of pageservers
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
const CONCURRENCY: usize = 32;
// Accumulate information about each tenant for cross-shard GC step we'll do at the end
let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
// Generate a stream of TenantTimelineId
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
let timelines = timelines.try_buffered(CONCURRENCY);
@@ -185,16 +503,17 @@ pub async fn pageserver_physical_gc(
target: &RootTarget,
mode: GcMode,
ttid: TenantShardTimelineId,
accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
) -> anyhow::Result<GcSummary> {
let mut summary = GcSummary::default();
let data = list_timeline_blobs(s3_client, ttid, target).await?;
let (latest_gen, candidates) = match &data.blob_data {
let (index_part, latest_gen, candidates) = match &data.blob_data {
BlobDataParseResult::Parsed {
index_part: _index_part,
index_part,
index_part_generation,
s3_layers: _s3_layers,
} => (*index_part_generation, data.unused_index_keys),
} => (index_part, *index_part_generation, data.unused_index_keys),
BlobDataParseResult::Relic => {
// Post-deletion tenant location: don't try and GC it.
return Ok(summary);
@@ -206,6 +525,8 @@ pub async fn pageserver_physical_gc(
}
};
accumulator.lock().unwrap().update(ttid, index_part);
for key in candidates {
maybe_delete_index(
s3_client,
@@ -222,17 +543,61 @@ pub async fn pageserver_physical_gc(
Ok(summary)
}
let timelines = timelines
.map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid));
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
let mut summary = GcSummary::default();
while let Some(i) = timelines.next().await {
let tl_summary = i?;
// Drain futures for per-shard GC, populating accumulator as a side effect
{
let timelines = timelines.map_ok(|ttid| {
gc_timeline(
&s3_client,
&bucket_config,
&min_age,
&target,
mode,
ttid,
&accumulator,
)
});
let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
summary.indices_deleted += tl_summary.indices_deleted;
summary.remote_storage_errors += tl_summary.remote_storage_errors;
while let Some(i) = timelines.next().await {
summary.merge(i?);
}
}
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
let ControllerClientConfig {
controller_api,
controller_jwt,
} = c;
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
}) else {
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
return Ok(summary);
};
let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator)
.unwrap()
.into_inner()
.unwrap()
.into_gc_ancestors(&controller_client, &mut summary)
.await;
for ancestor_shard in ancestor_shards {
gc_ancestor(
&s3_client,
&bucket_config,
&target,
&min_age,
ancestor_shard,
&ancestor_refs,
mode,
&mut summary,
)
.instrument(info_span!("gc_ancestor", %ancestor_shard))
.await?;
}
Ok(summary)

View File

@@ -146,6 +146,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_smgr_query_seconds_sum",
"pageserver_archive_size",
"pageserver_pitr_history_size",
"pageserver_layer_bytes",
"pageserver_layer_count",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",

View File

@@ -261,3 +261,47 @@ class NeonAPI:
if op["status"] in {"scheduling", "running", "cancelling"}:
has_running = True
time.sleep(0.5)
class NeonApiEndpoint:
def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]):
self.neon_api = neon_api
if project_id is None:
project = neon_api.create_project(pg_version)
neon_api.wait_for_operation_to_finish(project["project"]["id"])
self.project_id = project["project"]["id"]
self.endpoint_id = project["endpoints"][0]["id"]
self.connstr = project["connection_uris"][0]["connection_uri"]
self.pgbench_env = connection_parameters_to_env(
project["connection_uris"][0]["connection_parameters"]
)
self.is_new = True
else:
project = neon_api.get_project_details(project_id)
if int(project["project"]["pg_version"]) != int(pg_version):
raise Exception(
f"A project with the provided ID exists, but it's not of the specified version (expected {pg_version}, got {project['project']['pg_version']})"
)
self.project_id = project_id
eps = neon_api.get_endpoints(project_id)["endpoints"]
self.endpoint_id = eps[0]["id"]
self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[
"uri"
]
pw = self.connstr.split("@")[0].split(":")[-1]
self.pgbench_env = {
"PGHOST": eps[0]["host"],
"PGDATABASE": "neondb",
"PGUSER": "neondb_owner",
"PGPASSWORD": pw,
}
self.is_new = False
def restart(self):
self.neon_api.restart_endpoint(self.project_id, self.endpoint_id)
self.neon_api.wait_for_operation_to_finish(self.project_id)
def get_synthetic_storage_size(self) -> int:
return int(
self.neon_api.get_project_details(self.project_id)["project"]["synthetic_storage_size"]
)

View File

@@ -31,6 +31,7 @@ import backoff
import httpx
import jwt
import psycopg2
import psycopg2.sql
import pytest
import requests
import toml
@@ -87,7 +88,7 @@ from fixtures.utils import (
)
from fixtures.utils import AuxFileStore as AuxFileStore # reexport
from .neon_api import NeonAPI
from .neon_api import NeonAPI, NeonApiEndpoint
"""
This file contains pytest fixtures. A fixture is a test resource that can be
@@ -727,8 +728,30 @@ class NeonEnvBuilder:
self.repo_dir / "local_fs_remote_storage",
)
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
# restore storage controller (the db is small, don't bother with overlayfs)
storcon_db_from_dir = repo_dir / "storage_controller_db"
storcon_db_to_dir = self.repo_dir / "storage_controller_db"
log.info(f"Copying storage_controller_db from {storcon_db_from_dir} to {storcon_db_to_dir}")
assert storcon_db_from_dir.is_dir()
assert not storcon_db_to_dir.exists()
def ignore_postgres_log(path: str, _names):
if Path(path) == storcon_db_from_dir:
return {"postgres.log"}
return set()
shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log)
assert not (storcon_db_to_dir / "postgres.log").exists()
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller
# will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
for ps in self.env.pageservers:
patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
patch_script_path.write_text(patch_script)
# Update the config with info about tenants and timelines
with (self.repo_dir / "config").open("r") as f:
@@ -974,7 +997,7 @@ class NeonEnvBuilder:
if self.scrub_on_exit:
try:
StorageScrubber(self).scan_metadata()
self.env.storage_scrubber.scan_metadata()
except Exception as e:
log.error(f"Error during remote storage scrub: {e}")
cleanup_error = e
@@ -1135,6 +1158,7 @@ class NeonEnv:
"listen_http_addr": f"localhost:{pageserver_port.http}",
"pg_auth_type": pg_auth_type,
"http_auth_type": http_auth_type,
"image_compression": "zstd",
}
if self.pageserver_virtual_file_io_engine is not None:
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
@@ -1201,6 +1225,9 @@ class NeonEnv:
)
cfg["safekeepers"].append(sk_cfg)
# Scrubber instance for tests that use it, and for use during teardown checks
self.storage_scrubber = StorageScrubber(self, log_dir=config.test_output_dir)
log.info(f"Config: {cfg}")
self.neon_cli.init(
cfg,
@@ -2400,7 +2427,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
"""
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int}
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
"""
response = self.request(
"GET",
@@ -2786,8 +2813,8 @@ class NeonPageserver(PgProtocol, LogUtils):
)
return client.tenant_attach(
tenant_id,
generation,
config,
generation=generation,
)
def tenant_detach(self, tenant_id: TenantId):
@@ -3158,6 +3185,18 @@ class RemotePostgres(PgProtocol):
pass
@pytest.fixture(scope="function")
def benchmark_project_pub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint:
project_id = os.getenv("BENCHMARK_PROJECT_ID_PUB")
return NeonApiEndpoint(neon_api, pg_version, project_id)
@pytest.fixture(scope="function")
def benchmark_project_sub(neon_api: NeonAPI, pg_version: PgVersion) -> NeonApiEndpoint:
project_id = os.getenv("BENCHMARK_PROJECT_ID_SUB")
return NeonApiEndpoint(neon_api, pg_version, project_id)
@pytest.fixture(scope="function")
def remote_pg(
test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion
@@ -3773,12 +3812,12 @@ class Endpoint(PgProtocol, LogUtils):
self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers
)
def respec(self, **kwargs):
def respec(self, **kwargs: Any) -> None:
"""Update the endpoint.json file used by control_plane."""
# Read config
config_path = os.path.join(self.endpoint_path(), "endpoint.json")
with open(config_path, "r") as f:
data_dict = json.load(f)
data_dict: dict[str, Any] = json.load(f)
# Write it back updated
with open(config_path, "w") as file:
@@ -3786,13 +3825,13 @@ class Endpoint(PgProtocol, LogUtils):
json.dump(dict(data_dict, **kwargs), file, indent=4)
# Please note: Migrations only run if pg_skip_catalog_updates is false
def wait_for_migrations(self):
def wait_for_migrations(self, num_migrations: int = 10):
with self.cursor() as cur:
def check_migrations_done():
cur.execute("SELECT id FROM neon_migration.migration_id")
migration_id = cur.fetchall()[0][0]
assert migration_id != 0
migration_id: int = cur.fetchall()[0][0]
assert migration_id >= num_migrations
wait_until(20, 0.5, check_migrations_done)
@@ -4042,6 +4081,22 @@ class Safekeeper(LogUtils):
self.id = id
self.running = running
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
if extra_opts is None:
# Testing defaults: enable everything, and set short timeouts so that background
# work will happen during short tests.
# **Note**: Any test that explicitly sets extra_opts will not get these defaults.
extra_opts = [
"--enable-offload",
"--delete-offloaded-wal",
"--partial-backup-timeout",
"10s",
"--control-file-save-interval",
"1s",
"--eviction-min-resident",
"10s",
]
self.extra_opts = extra_opts
def start(
@@ -4213,9 +4268,9 @@ class Safekeeper(LogUtils):
class StorageScrubber:
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
def __init__(self, env: NeonEnv, log_dir: Path):
self.env = env
self.log_dir = log_dir or env.test_output_dir
self.log_dir = log_dir
def scrubber_cli(self, args: list[str], timeout) -> str:
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
@@ -4232,11 +4287,14 @@ class StorageScrubber:
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [str(self.env.neon_binpath / "storage_scrubber")]
base_args = [
str(self.env.neon_binpath / "storage_scrubber"),
f"--controller-api={self.env.storage_controller_api}",
]
args = base_args + args
(output_path, stdout, status_code) = subprocess_capture(
self.env.test_output_dir,
self.log_dir,
args,
echo_stderr=True,
echo_stdout=True,
@@ -4275,7 +4333,10 @@ class StorageScrubber:
log.info(f"tenant-snapshot output: {stdout}")
def pageserver_physical_gc(
self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None
self,
min_age_secs: int,
tenant_ids: Optional[list[TenantId]] = None,
mode: Optional[str] = None,
):
args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"]
@@ -4285,6 +4346,9 @@ class StorageScrubber:
for tenant_id in tenant_ids:
args.extend(["--tenant-id", str(tenant_id)])
if mode is not None:
args.extend(["--mode", mode])
stdout = self.scrubber_cli(
args,
timeout=30,
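For reference, a hedged sketch of invoking the extended scrubber helper. The diff only shows the new `--mode` flag being forwarded to the CLI, so the `"full"` value and the surrounding wiring are assumptions:
```
# Sketch: the constructor now requires an explicit log directory (see above);
# the "full" mode string is an assumption, the diff only forwards --mode verbatim.
scrubber = StorageScrubber(env, log_dir=test_output_dir)
scrubber.pageserver_physical_gc(
    min_age_secs=1,
    tenant_ids=[env.initial_tenant],
    mode="full",
)
```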

View File

@@ -117,6 +117,9 @@ class LayerMapInfo:
def image_layers(self) -> List[HistoricLayerInfo]:
return [x for x in self.historic_layers if x.kind == "Image"]
def delta_l0_layers(self) -> List[HistoricLayerInfo]:
return [x for x in self.historic_layers if x.kind == "Delta" and x.l0]
def historic_by_name(self) -> Set[str]:
return set(x.layer_file_name for x in self.historic_layers)
@@ -172,6 +175,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
if auth_token is not None:
self.headers["Authorization"] = f"Bearer {auth_token}"
def without_status_retrying(self) -> PageserverHttpClient:
retries = Retry(
status=0,
connect=5,
read=False,
backoff_factor=0.2,
status_forcelist=[],
allowed_methods=None,
remove_headers_on_redirect=[],
)
return PageserverHttpClient(
self.port, self.is_testing_enabled_or_skip, self.auth_token, retries
)
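The new helper builds a client whose retry policy only covers connection errors, so a test can observe an error status directly instead of having it retried away. A minimal sketch; the specific call and its failure mode are illustrative assumptions:
```
# Sketch: obtain a non-retrying client when a test expects (and wants to assert
# on) a non-2xx response rather than having urllib3 retry it.
client = env.pageserver.http_client().without_status_retrying()
with pytest.raises(PageserverApiException):
    client.timeline_detail(tenant_id, timeline_id)  # illustrative call against a timeline expected to be gone
```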
@property
def base_url(self) -> str:
return f"http://localhost:{self.port}"
@@ -223,8 +241,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
def tenant_attach(
self,
tenant_id: Union[TenantId, TenantShardId],
generation: int,
config: None | Dict[str, Any] = None,
generation: Optional[int] = None,
):
config = config or {}
@@ -814,17 +832,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
batch_size: int | None = None,
) -> Set[TimelineId]:
**kwargs,
) -> List[TimelineId]:
params = {}
if batch_size is not None:
params["batch_size"] = batch_size
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor",
params=params,
**kwargs,
)
self.verbose_error(res)
json = res.json()
return set(map(TimelineId, json["reparented_timelines"]))
return list(map(TimelineId, json["reparented_timelines"]))
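The detach-ancestor helper now returns a list rather than a set, presumably to preserve the order reported in `reparented_timelines`. A hedged sketch; the method name is assumed from the endpoint path shown above:
```
# Sketch: the helper keeps the API's ordering of reparented timelines.
reparented = ps_http.detach_ancestor(tenant_id, timeline_id)  # method name assumed from the URL path
for tl in reparented:
    log.info(f"reparented timeline: {tl}")
```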
def evict_layer(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str

View File

@@ -255,11 +255,3 @@ def run_pagebench_benchmark(
unit="ms",
report=MetricReport.LOWER_IS_BETTER,
)
env.storage_controller.allowed_errors.append(
# The test setup swaps NeonEnv instances, hence different
# pg instances are used for the storage controller db. This means
# the storage controller doesn't know about the nodes mentioned
# in attachments.json at start-up.
".* Scheduler missing node 1",
)

View File

@@ -2,6 +2,7 @@ from contextlib import closing
import pytest
from fixtures.compare_fixtures import NeonCompare
from fixtures.log_helper import log
from fixtures.neon_fixtures import wait_for_last_flush_lsn
@@ -56,3 +57,98 @@ def test_compaction(neon_compare: NeonCompare):
pageserver_http.timeline_compact(tenant_id, timeline_id)
neon_compare.report_size()
def test_compaction_l0_memory(neon_compare: NeonCompare):
"""
Generate a large stack of L0s pending compaction into L1s, and
measure the pageserver's peak RSS while doing so
"""
env = neon_compare.env
pageserver_http = env.pageserver.http_client()
tenant_id, timeline_id = env.neon_cli.create_tenant(
conf={
# Initially disable compaction so that we will build up a stack of L0s
"compaction_period": "0s",
"gc_period": "0s",
}
)
neon_compare.tenant = tenant_id
neon_compare.timeline = timeline_id
endpoint = env.endpoints.create_start(
"main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
)
# Read tenant effective config and assert on checkpoint_distance and compaction_threshold,
# as we do want to test with the defaults (the same values used in the field), but this test's workload size makes assumptions about them.
#
# If these assertions fail, it probably means we changed the default.
tenant_conf = pageserver_http.tenant_config(tenant_id)
assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024
assert tenant_conf.effective_config["compaction_threshold"] == 10
# Aim to write about 20 L0s, so that we will hit the limit on how many
# to compact at once
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
for i in range(200):
cur.execute(f"create table tbl{i} (i int, j int);")
cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);")
for j in range(100):
cur.execute(f"update tbl{i} set j = {j};")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop()
# Check we have generated the L0 stack we expected
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
initial_l0s = len(layers.delta_l0_layers())
initial_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
log.info(f"l0s before compaction {initial_l0s} ({initial_l0s_size})")
def rss_hwm():
v = pageserver_http.get_metric_value("libmetrics_maxrss_kb")
assert v is not None
assert v > 0
return v * 1024
before = rss_hwm()
pageserver_http.timeline_compact(tenant_id, timeline_id)
after = rss_hwm()
log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})")
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
final_l0s_size = sum(x.layer_file_size for x in layers.delta_l0_layers())
log.info(f"l0s after compaction {len(layers.delta_l0_layers())} ({final_l0s_size})")
assert after > before # If we didn't use some memory the test is probably buggy
compaction_mapped_rss = after - before
# During L0 compaction, we require as much memory as the physical size of what we compacted, and then some,
# because the key->value mapping in L0 compaction is exhaustive, non-streaming, and does not de-duplicate
# repeated references to the same key.
#
# To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
# this memory estimate can be revised far downwards to something that doesn't scale
# linearly with the layer sizes.
MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25
# If we find that compaction is using more memory, this may indicate a regression
assert compaction_mapped_rss < MEMORY_ESTIMATE
# If we find that compaction is using less than half the expected memory then:
# - maybe we made a big efficiency improvement, in which case update the test
# - maybe something is functionally wrong with the test and it's not driving the system as expected
assert compaction_mapped_rss > MEMORY_ESTIMATE / 2
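A worked example of the budget above, with purely illustrative numbers:
```
# Illustrative numbers only: if the L0 stack is 2.0 GiB before compaction and
# 0.5 GiB of L0s remain afterwards, MEMORY_ESTIMATE = (2.0 - 0.5) * 1.25 = 1.875 GiB,
# so the observed RSS growth must fall between 1.875 / 2 = 0.9375 GiB and 1.875 GiB.
```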
# We should have compacted some but not all of the L0s, based on the limit on how much
# L0 to compact in one go
assert len(layers.delta_l0_layers()) > 0
assert len(layers.delta_l0_layers()) < initial_l0s
# The pageserver should have logged when it hit the compaction size limit
env.pageserver.assert_log_contains(".*hit max delta layer size limit.*")

View File

@@ -1,7 +1,6 @@
from __future__ import annotations
import time
import traceback
from typing import TYPE_CHECKING
import psycopg2
@@ -10,15 +9,12 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_api import connection_parameters_to_env
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
from fixtures.pg_version import PgVersion
if TYPE_CHECKING:
from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.neon_api import NeonAPI
from fixtures.neon_api import NeonApiEndpoint
from fixtures.neon_fixtures import NeonEnv, PgBin
from fixtures.pg_version import PgVersion
@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@@ -86,8 +82,8 @@ def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600):
@pytest.mark.timeout(2 * 60 * 60)
def test_subscriber_lag(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
benchmark_project_pub: NeonApiEndpoint,
benchmark_project_sub: NeonApiEndpoint,
zenbenchmark: NeonBenchmarker,
):
"""
@@ -99,125 +95,82 @@ def test_subscriber_lag(
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"
pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
pub_env = benchmark_project_pub.pgbench_env
sub_env = benchmark_project_sub.pgbench_env
pub_connstr = benchmark_project_pub.connstr
sub_connstr = benchmark_project_sub.connstr
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
if benchmark_project_pub.is_new:
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
if benchmark_project_sub.is_new:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
sub_endpoint_id = sub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
benchmark_project_sub.restart()
pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
neon_api.restart_endpoint(
sub_project_id,
sub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)
finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
# Measure storage to make sure replication information isn't bloating storage
sub_storage = benchmark_project_sub.get_synthetic_storage_size()
pub_storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER)
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
sub_workload.terminate()
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
pub_workload.terminate()
@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_publisher_restart(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
benchmark_project_pub: NeonApiEndpoint,
benchmark_project_sub: NeonApiEndpoint,
zenbenchmark: NeonBenchmarker,
):
"""
@@ -229,114 +182,70 @@ def test_publisher_restart(
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"
pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
pub_endpoint_id = pub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
pub_env = benchmark_project_pub.pgbench_env
sub_env = benchmark_project_sub.pgbench_env
pub_connstr = benchmark_project_pub.connstr
sub_connstr = benchmark_project_sub.connstr
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
if benchmark_project_pub.is_new:
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
if benchmark_project_sub.is_new:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record("initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")
pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()
zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")
pub_workload.terminate()
neon_api.restart_endpoint(
pub_project_id,
pub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(pub_project_id)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)
finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
benchmark_project_pub.restart()
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
# Measure storage to make sure replication information isn't bloating storage
sub_storage = benchmark_project_sub.get_synthetic_storage_size()
pub_storage = benchmark_project_pub.get_synthetic_storage_size()
zenbenchmark.record("sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record("pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER)
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
sub_workload.terminate()
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
pub_workload.terminate()

View File

@@ -211,7 +211,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
def check_pageserver(expect_success: bool, **conn_kwargs):
check_connection(
env.pageserver,
f"show {env.initial_tenant}",
f"pagestream {env.initial_tenant} {env.initial_timeline}",
expect_success,
**conn_kwargs,
)

View File

@@ -65,8 +65,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
"compaction_period": "1 s",
"compaction_threshold": "2",
"image_creation_threshold": "1",
# set PITR interval to be small, so we can do GC
"pitr_interval": "1 s",
# Disable PITR, this test will set an explicit space-based GC limit
"pitr_interval": "0 s",
}
)

View File

@@ -6,7 +6,10 @@ from typing import Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
from fixtures.neon_fixtures import (
NeonEnvBuilder,
generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -142,6 +145,10 @@ def test_sharding_compaction(
"image_layer_creation_check_threshold": 0,
}
# Disable compression, as we can't estimate the size of layers with compression enabled
# TODO: implement eager layer cutting during compaction
neon_env_builder.pageserver_config_override = "image_compression='disabled'"
neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF,
@@ -320,3 +327,87 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
or 0
) == 0
assert not env.pageserver.log_contains(".*Circuit breaker failure ended.*")
@pytest.mark.parametrize("enabled", [True, False])
def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool):
tenant_conf = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_threshold": "1",
"compaction_target_size": f"{128 * 1024}",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "0s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers as eagerly as possible
"image_creation_threshold": "1",
"image_layer_creation_check_threshold": "0",
}
# Explicitly enable/disable compression, rather than using default
if enabled:
neon_env_builder.pageserver_config_override = "image_compression='zstd'"
else:
neon_env_builder.pageserver_config_override = "image_compression='disabled'"
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
pageserver = env.pageserver
ps_http = env.pageserver.http_client()
with env.endpoints.create_start(
"main", tenant_id=tenant_id, pageserver_id=pageserver.id
) as endpoint:
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
# Generate around 800k worth of easily compressible data to store
for v in range(100):
endpoint.safe_psql(
f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))"
)
# run compaction to create image layers
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
layer_map = ps_http.layer_map_info(tenant_id, timeline_id)
image_layer_count = 0
delta_layer_count = 0
for layer in layer_map.historic_layers:
if layer.kind == "Image":
image_layer_count += 1
elif layer.kind == "Delta":
delta_layer_count += 1
assert image_layer_count > 0
assert delta_layer_count > 0
log.info(f"images: {image_layer_count}, deltas: {delta_layer_count}")
bytes_in = pageserver.http_client().get_metric_value(
"pageserver_compression_image_in_bytes_total"
)
bytes_out = pageserver.http_client().get_metric_value(
"pageserver_compression_image_out_bytes_total"
)
assert bytes_in is not None
assert bytes_out is not None
log.info(f"Compression ratio: {bytes_out/bytes_in} ({bytes_out} in, {bytes_out} out)")
if enabled:
# We are writing highly compressible, repetitive plain text, so expect excellent compression
EXPECT_RATIO = 0.2
assert bytes_out / bytes_in < EXPECT_RATIO
else:
# Nothing should be compressed if we disabled it.
assert bytes_out >= bytes_in
# Destroy the endpoint and create a new one to reset the caches
with env.endpoints.create_start(
"main", tenant_id=tenant_id, pageserver_id=pageserver.id
) as endpoint:
for v in range(100):
res = endpoint.safe_psql(
f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
)
assert res[0][0] == 1

Some files were not shown because too many files have changed in this diff