Merge pull request #9426 from neondatabase/rc/proxy/2024-10-17

Proxy release 2024-10-17
Merge pull request #9341 from neondatabase/rc/proxy/2024-10-10
2026-03-23 02:00:38 +00:00 · 2024-10-17 12:18:51 +02:00 · 2024-10-10 09:17:11 +02:00 · 2024-10-03 11:01:41 +02:00 · 2024-09-26 09:22:33 +01:00 · 2024-09-19 10:41:17 +01:00
232 changed files with 4756 additions and 10092 deletions
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -27,7 +27,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: neondatabase/build-tools:pinned-bookworm
+      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -53,6 +53,20 @@ jobs:
      BUILD_TAG: ${{ inputs.build-tag }}

    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16 17; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
      - uses: actions/checkout@v4
        with:
          submodules: true
@@ -110,28 +124,28 @@ jobs:
        uses: actions/cache@v4
        with:
          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
        uses: actions/cache@v4
        with:
          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
        uses: actions/cache@v4
        with:
          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v17 build
        id: cache_pg_17
        uses: actions/cache@v4
        with:
          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -83,7 +83,7 @@ jobs:

    runs-on: ${{ matrix.RUNNER }}
    container:
-      image: neondatabase/build-tools:pinned-bookworm
+      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -178,7 +178,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: neondatabase/build-tools:pinned-bookworm
+      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -280,7 +280,7 @@ jobs:
        region_id_default=${{ env.DEFAULT_REGION_ID }}
        runner_default='["self-hosted", "us-east-2", "x64"]'
        runner_azure='["self-hosted", "eastus2", "x64"]'
-        image_default="neondatabase/build-tools:pinned-bookworm"
+        image_default="neondatabase/build-tools:pinned"
        matrix='{
          "pg_version" : [
            16
@@ -299,9 +299,9 @@ jobs:
          "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
-                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
+                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned" },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
        }'

@@ -665,7 +665,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: neondatabase/build-tools:pinned-bookworm
+      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -772,7 +772,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: neondatabase/build-tools:pinned-bookworm
+      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -877,7 +877,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: neondatabase/build-tools:pinned-bookworm
+      image: neondatabase/build-tools:pinned
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -82,7 +82,7 @@ jobs:

      - uses: docker/build-push-action@v6
        with:
-          file: build-tools.Dockerfile
+          file: Dockerfile.build-tools
          context: .
          provenance: false
          push: true
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -683,7 +683,7 @@ jobs:
          provenance: false
          push: true
          pull: true
-          file: compute/compute-node.Dockerfile
+          file: compute/Dockerfile.compute-node
          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
@@ -703,7 +703,7 @@ jobs:
          provenance: false
          push: true
          pull: true
-          file: compute/compute-node.Dockerfile
+          file: compute/Dockerfile.compute-node
          target: neon-pg-ext-test
          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
@@ -728,7 +728,7 @@ jobs:
          provenance: false
          push: true
          pull: true
-          file: compute/compute-node.Dockerfile
+          file: compute/Dockerfile.compute-node
          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
@@ -1078,6 +1078,20 @@ jobs:
    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16 17; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
      - uses: actions/checkout@v4

      - name: Trigger deploy workflow
@@ -1116,11 +1130,7 @@ jobs:

            gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
-              -f deployProxyLink=true \
-              -f deployPrivatelinkProxy=true \
-              -f deployLegacyProxyScram=true \
-              -f deployProxyScram=true \
-              -f deployProxyAuthBroker=true \
+              -f deployProxy=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          else
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -31,7 +31,7 @@ jobs:
        id: get-build-tools-tag
        env:
          IMAGE_TAG: |
-            ${{ hashFiles('build-tools.Dockerfile',
+            ${{ hashFiles('Dockerfile.build-tools',
                          '.github/workflows/check-build-tools-image.yml',
                          '.github/workflows/build-build-tools-image.yml') }}
        run: |
--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -31,7 +31,7 @@ jobs:

    runs-on: us-east-2
    container:
-      image: neondatabase/build-tools:pinned-bookworm
+      image: neondatabase/build-tools:pinned
      options: --init

    steps:
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -112,7 +112,7 @@ jobs:
                # This isn't exhaustive, just the paths that are most directly compute-related.
                # For example, compute_ctl also depends on libs/utils, but we don't trigger
                # an e2e run on that.
-                vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/compute-node.Dockerfile)
+                vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
--- a/.gitignore
+++ b/.gitignore
@@ -6,8 +6,6 @@ __pycache__/
 test_output/
 .vscode
 .idea
-*.swp
-tags
 neon.iml
 /.neon
 /integration_tests/.neon
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -148,9 +148,9 @@ dependencies = [

 [[package]]
 name = "asn1-rs"
-version = "0.6.2"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048"
+checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0"
 dependencies = [
 "asn1-rs-derive",
 "asn1-rs-impl",
@@ -164,25 +164,25 @@ dependencies = [

 [[package]]
 name = "asn1-rs-derive"
-version = "0.5.1"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
+checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.52",
+ "syn 1.0.109",
 "synstructure",
 ]

 [[package]]
 name = "asn1-rs-impl"
-version = "0.2.0"
+version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
+checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.52",
+ "syn 1.0.109",
 ]

 [[package]]
@@ -310,33 +310,6 @@ dependencies = [
 "zeroize",
 ]

-[[package]]
-name = "aws-lc-rs"
-version = "1.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070"
-dependencies = [
- "aws-lc-sys",
- "mirai-annotations",
- "paste",
- "zeroize",
-]
-
-[[package]]
-name = "aws-lc-sys"
-version = "0.21.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62"
-dependencies = [
- "bindgen 0.69.5",
- "cc",
- "cmake",
- "dunce",
- "fs_extra",
- "libc",
- "paste",
-]
-
 [[package]]
 name = "aws-runtime"
 version = "1.4.3"
@@ -622,7 +595,7 @@ dependencies = [
 "once_cell",
 "pin-project-lite",
 "pin-utils",
- "rustls 0.21.12",
+ "rustls 0.21.11",
 "tokio",
 "tracing",
 ]
@@ -942,29 +915,6 @@ dependencies = [
 "serde",
 ]

-[[package]]
-name = "bindgen"
-version = "0.69.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
-dependencies = [
- "bitflags 2.4.1",
- "cexpr",
- "clang-sys",
- "itertools 0.10.5",
- "lazy_static",
- "lazycell",
- "log",
- "prettyplease",
- "proc-macro2",
- "quote",
- "regex",
- "rustc-hash",
- "shlex",
- "syn 2.0.52",
- "which",
-]
-
 [[package]]
 name = "bindgen"
 version = "0.70.1"
@@ -974,7 +924,7 @@ dependencies = [
 "bitflags 2.4.1",
 "cexpr",
 "clang-sys",
- "itertools 0.10.5",
+ "itertools 0.12.1",
 "log",
 "prettyplease",
 "proc-macro2",
@@ -1088,13 +1038,12 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

 [[package]]
 name = "cc"
-version = "1.1.30"
+version = "1.0.83"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
+checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
 dependencies = [
 "jobserver",
 "libc",
- "shlex",
 ]

 [[package]]
@@ -1220,15 +1169,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"

-[[package]]
-name = "cmake"
-version = "0.1.51"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -1684,9 +1624,9 @@ dependencies = [

 [[package]]
 name = "der-parser"
-version = "9.0.0"
+version = "8.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553"
+checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e"
 dependencies = [
 "asn1-rs",
 "displaydoc",
@@ -1815,12 +1755,6 @@ dependencies = [
 "syn 2.0.52",
 ]

-[[package]]
-name = "dunce"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
-
 [[package]]
 name = "dyn-clone"
 version = "1.0.14"
@@ -2125,12 +2059,6 @@ dependencies = [
 "tokio-util",
 ]

-[[package]]
-name = "fs_extra"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
-
 [[package]]
 name = "fsevent-sys"
 version = "4.1.0"
@@ -2484,15 +2412,6 @@ dependencies = [
 "digest",
 ]

-[[package]]
-name = "home"
-version = "0.5.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
-dependencies = [
- "windows-sys 0.52.0",
-]
-
 [[package]]
 name = "hostname"
 version = "0.4.0"
@@ -2662,7 +2581,7 @@ dependencies = [
 "http 0.2.9",
 "hyper 0.14.30",
 "log",
- "rustls 0.21.12",
+ "rustls 0.21.11",
 "rustls-native-certs 0.6.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -2882,9 +2801,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"

 [[package]]
 name = "jobserver"
-version = "0.1.32"
+version = "0.1.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
+checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
 dependencies = [
 "libc",
 ]
@@ -2988,12 +2907,6 @@ dependencies = [
 "spin",
 ]

-[[package]]
-name = "lazycell"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
-
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3224,12 +3137,6 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

-[[package]]
-name = "mirai-annotations"
-version = "1.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
-
 [[package]]
 name = "multimap"
 version = "0.8.3"
@@ -3449,9 +3356,9 @@ dependencies = [

 [[package]]
 name = "oid-registry"
-version = "0.7.1"
+version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9"
+checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff"
 dependencies = [
 "asn1-rs",
 ]
@@ -4146,14 +4053,14 @@ dependencies = [
 "bytes",
 "once_cell",
 "pq_proto",
- "rustls 0.23.7",
+ "rustls 0.22.4",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror",
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.25.0",
 "tokio-util",
 "tracing",
 ]
@@ -4175,7 +4082,7 @@ name = "postgres_ffi"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "bindgen 0.70.1",
+ "bindgen",
 "bytes",
 "crc32c",
 "env_logger",
@@ -4312,7 +4219,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
 "bytes",
 "heck 0.5.0",
- "itertools 0.10.5",
+ "itertools 0.12.1",
 "log",
 "multimap",
 "once_cell",
@@ -4332,7 +4239,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools 0.12.1",
 "proc-macro2",
 "quote",
 "syn 2.0.52",
@@ -4420,8 +4327,8 @@ dependencies = [
 "rsa",
 "rstest",
 "rustc-hash",
- "rustls 0.23.7",
- "rustls-native-certs 0.8.0",
+ "rustls 0.22.4",
+ "rustls-native-certs 0.7.0",
 "rustls-pemfile 2.1.1",
 "scopeguard",
 "serde",
@@ -4438,7 +4345,7 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.25.0",
 "tokio-tungstenite",
 "tokio-util",
 "tracing",
@@ -4602,13 +4509,12 @@ dependencies = [

 [[package]]
 name = "rcgen"
-version = "0.13.1"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779"
+checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
 dependencies = [
 "pem",
 "ring",
- "rustls-pki-types",
 "time",
 "yasna",
 ]
@@ -4787,7 +4693,7 @@ dependencies = [
 "once_cell",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.12",
+ "rustls 0.21.11",
 "rustls-pemfile 1.0.2",
 "serde",
 "serde_json",
@@ -5085,9 +4991,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.21.12"
+version = "0.21.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e"
+checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
 dependencies = [
 "log",
 "ring",
@@ -5115,7 +5021,6 @@ version = "0.23.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b"
 dependencies = [
- "aws-lc-rs",
 "log",
 "once_cell",
 "ring",
@@ -5184,9 +5089,9 @@ dependencies = [

 [[package]]
 name = "rustls-pki-types"
-version = "1.10.0"
+version = "1.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
+checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"

 [[package]]
 name = "rustls-webpki"
@@ -5204,7 +5109,6 @@ version = "0.102.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
 dependencies = [
- "aws-lc-rs",
 "ring",
 "rustls-pki-types",
 "untrusted",
@@ -5408,7 +5312,7 @@ checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
 dependencies = [
 "httpdate",
 "reqwest 0.12.4",
- "rustls 0.21.12",
+ "rustls 0.21.11",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -5903,8 +5807,8 @@ dependencies = [
 "postgres_ffi",
 "remote_storage",
 "reqwest 0.12.4",
- "rustls 0.23.7",
- "rustls-native-certs 0.8.0",
+ "rustls 0.22.4",
+ "rustls-native-certs 0.7.0",
 "serde",
 "serde_json",
 "storage_controller_client",
@@ -6026,13 +5930,14 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"

 [[package]]
 name = "synstructure"
-version = "0.13.1"
+version = "0.12.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
+checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.52",
+ "syn 1.0.109",
+ "unicode-xid",
 ]

 [[package]]
@@ -6272,7 +6177,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6331,15 +6236,16 @@ dependencies = [

 [[package]]
 name = "tokio-postgres-rustls"
-version = "0.12.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
+checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
 dependencies = [
+ "futures",
 "ring",
- "rustls 0.23.7",
+ "rustls 0.22.4",
 "tokio",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.25.0",
 "x509-certificate",
 ]

@@ -6349,7 +6255,7 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
 dependencies = [
- "rustls 0.21.12",
+ "rustls 0.21.11",
 "tokio",
 ]

@@ -6772,15 +6678,16 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"

 [[package]]
 name = "ureq"
-version = "2.10.1"
+version = "2.9.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a"
+checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
 dependencies = [
 "base64 0.22.1",
 "log",
 "once_cell",
- "rustls 0.23.7",
+ "rustls 0.22.4",
 "rustls-pki-types",
+ "rustls-webpki 0.102.2",
 "url",
 "webpki-roots 0.26.1",
 ]
@@ -6788,7 +6695,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6"
 dependencies = [
 "bytes",
 "io-uring",
@@ -6969,7 +6876,7 @@ name = "walproposer"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "bindgen 0.70.1",
+ "bindgen",
 "postgres_ffi",
 "utils",
 ]
@@ -7144,18 +7051,6 @@ dependencies = [
 "rustls-pki-types",
 ]

-[[package]]
-name = "which"
-version = "4.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
-dependencies = [
- "either",
- "home",
- "once_cell",
- "rustix",
-]
-
 [[package]]
 name = "whoami"
 version = "1.5.1"
@@ -7400,6 +7295,7 @@ dependencies = [
 "digest",
 "either",
 "fail",
+ "futures",
 "futures-channel",
 "futures-executor",
 "futures-io",
@@ -7415,7 +7311,7 @@ dependencies = [
 "hyper-util",
 "indexmap 1.9.3",
 "indexmap 2.0.1",
- "itertools 0.10.5",
+ "itertools 0.12.1",
 "lazy_static",
 "libc",
 "log",
@@ -7436,8 +7332,6 @@ dependencies = [
 "regex-automata 0.4.3",
 "regex-syntax 0.8.2",
 "reqwest 0.12.4",
- "rustls 0.23.7",
- "rustls-webpki 0.102.2",
 "scopeguard",
 "serde",
 "serde_json",
@@ -7446,6 +7340,7 @@ dependencies = [
 "smallvec",
 "spki 0.7.3",
 "subtle",
+ "syn 1.0.109",
 "syn 2.0.52",
 "sync_wrapper 0.1.2",
 "tikv-jemalloc-sys",
@@ -7453,7 +7348,6 @@ dependencies = [
 "time-macros",
 "tokio",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
 "tokio-stream",
 "tokio-util",
 "toml_edit",
@@ -7489,9 +7383,9 @@ dependencies = [

 [[package]]
 name = "x509-parser"
-version = "0.16.0"
+version = "0.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69"
+checksum = "bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634"
 dependencies = [
 "asn1-rs",
 "data-encoding",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -142,7 +142,7 @@ reqwest-retry = "0.5"
 routerify = "3"
 rpds = "0.13"
 rustc-hash = "1.1.0"
-rustls = "0.23"
+rustls = "0.22"
 rustls-pemfile = "2"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
@@ -172,8 +172,8 @@ tikv-jemalloc-ctl = "0.5"
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
-tokio-postgres-rustls = "0.12.0"
-tokio-rustls = "0.26"
+tokio-postgres-rustls = "0.11.0"
+tokio-rustls = "0.25"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
@@ -192,8 +192,8 @@ url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
-rustls-native-certs = "0.8"
-x509-parser = "0.16"
+rustls-native-certs = "0.7"
+x509-parser = "0.15"
 whoami = "1.5.1"

 ## TODO replace this with tracing
@@ -244,7 +244,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
 criterion = "0.5.1"
-rcgen = "0.13"
+rcgen = "0.12"
 rstest = "0.18"
 camino-tempfile = "1.0.2"
 tonic-build = "0.12"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -72,7 +72,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
    && mv s5cmd /usr/local/bin/s5cmd

 # LLVM
-ENV LLVM_VERSION=19
+ENV LLVM_VERSION=18
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
@@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
    && rm awscliv2.zip

 # Mold: A Modern Linker
-ENV MOLD_VERSION=v2.34.1
+ENV MOLD_VERSION=v2.33.0
 RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
@@ -142,7 +142,7 @@ RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/sourc
 # Use the same version of libicu as the compute nodes so that
 # clusters created using inidb on pageserver can be used by computes.
 #
-# TODO: at this time, compute-node.Dockerfile uses the debian bullseye libicu
+# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
 # package, which is 67.1. We're duplicating that knowledge here, and also, technically,
 # Debian has a few patches on top of 67.1 that we're not adding here.
 ENV ICU_VERSION=67.1
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.82.0
+ENV RUSTC_VERSION=1.81.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/Makefile
+++ b/Makefile
@@ -297,7 +297,7 @@ clean: postgres-clean neon-pg-clean-ext
 # This removes everything
 .PHONY: distclean
 distclean:
-	$(RM) -r $(POSTGRES_INSTALL_DIR)
+	rm -rf $(POSTGRES_INSTALL_DIR)
 	$(CARGO_CMD_PREFIX) cargo clean

 .PHONY: fmt
@@ -329,7 +329,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
 		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
-	$(RM) pg*.BAK
+	rm -f pg*.BAK

 # Indent pxgn/neon.
 .PHONY: neon-pgindent
--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -353,10 +353,13 @@ COPY compute/patches/pgvector.patch /pgvector.patch
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
 #
-# vector 0.7.4 supports v17
-# last release v0.7.4 - Aug 5, 2024
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \
-    echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \
+# v17 is not supported yet because of upstream issue
+# https://github.com/pgvector/pgvector/issues/669
+RUN case "${PG_VERSION}" in "v17") \
+    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+    esac && \
+    wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
+    echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -975,8 +978,8 @@ ARG PG_VERSION
 RUN case "${PG_VERSION}" in "v17") \
    echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
    esac && \
-    wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \
+    wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -20,21 +20,19 @@ neon_collector_autoscaling.yml: $(jsonnet_files)
 sql_exporter.yml: $(jsonnet_files)
 	JSONNET_PATH=etc jsonnet \
 		--output-file etc/$@ \
-		--tla-str collector_name=neon_collector \
 		--tla-str collector_file=neon_collector.yml \
 		etc/sql_exporter.jsonnet

 sql_exporter_autoscaling.yml: $(jsonnet_files)
 	JSONNET_PATH=etc jsonnet \
 		--output-file etc/$@ \
-		--tla-str collector_name=neon_collector_autoscaling \
 		--tla-str collector_file=neon_collector_autoscaling.yml \
 		--tla-str application_name=sql_exporter_autoscaling \
 		etc/sql_exporter.jsonnet

 .PHONY: clean
 clean:
-	$(RM) \
+	rm --force \
 		etc/neon_collector.yml \
 		etc/neon_collector_autoscaling.yml \
 		etc/sql_exporter.yml \
--- a/compute/README.md
+++ b/compute/README.md
@@ -1,7 +1,7 @@
 This directory contains files that are needed to build the compute
 images, or included in the compute images.

-compute-node.Dockerfile
+Dockerfile.compute-node
 	To build the compute image

 vm-image-spec.yaml
@@ -14,8 +14,8 @@ etc/
 patches/
 	Some extensions need to be patched to work with Neon. This
 	directory contains such patches. They are applied to the extension
-	sources in compute-node.Dockerfile
+	sources in Dockerfile.compute-node

 In addition to these, postgres itself, the neon postgres extension,
 and compute_ctl are built and copied into the compute image by
-compute-node.Dockerfile.
+Dockerfile.compute-node.
--- a/compute/etc/sql_exporter.jsonnet
+++ b/compute/etc/sql_exporter.jsonnet
@@ -1,4 +1,4 @@
-function(collector_name, collector_file, application_name='sql_exporter') {
+function(collector_file, application_name='sql_exporter') {
  // Configuration for sql_exporter for autoscaling-agent
  // Global defaults.
  global: {
@@ -28,7 +28,7 @@ function(collector_name, collector_file, application_name='sql_exporter') {
    // Collectors (referenced by name) to execute on the target.
    // Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
    collectors: [
-      collector_name,
+      'neon_collector',
    ],
  },

--- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet
+++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet
@@ -1,7 +1,7 @@
 local neon = import 'neon.libsonnet';

-local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql';
-local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql';
+local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql';
+local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql';

 {
  metric_name: 'checkpoints_timed',
--- a/compute/etc/sql_exporter/retained_wal.sql
+++ b/compute/etc/sql_exporter/retained_wal.sql
@@ -1,10 +1,5 @@
 SELECT
  slot_name,
-  pg_wal_lsn_diff(
-    CASE
-      WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
-      ELSE pg_current_wal_lsn()
-    END,
-    restart_lsn)::FLOAT8 AS retained_wal
+  pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
 FROM pg_replication_slots
 WHERE active = false;
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -18,7 +18,7 @@ commands:
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -18,7 +18,7 @@ commands:
  - name: pgbouncer
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0'
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -15,7 +15,6 @@ use std::time::Instant;

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use compute_api::spec::PgIdent;
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
@@ -26,9 +25,8 @@ use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

-use compute_api::privilege::Privilege;
 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

 use nix::sys::signal::{kill, Signal};
@@ -36,7 +34,6 @@ use nix::sys::signal::{kill, Signal};
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
-use crate::installed_extensions::get_installed_extensions_sync;
 use crate::local_proxy;
 use crate::logger::inlinify;
 use crate::pg_helpers::*;
@@ -1124,11 +1121,6 @@ impl ComputeNode {
                self.pg_reload_conf()?;
            }
            self.post_apply_config()?;
-
-            let connstr = self.connstr.clone();
-            thread::spawn(move || {
-                get_installed_extensions_sync(connstr).context("get_installed_extensions")
-            });
        }

        let startup_end_time = Utc::now();
@@ -1375,97 +1367,6 @@ LIMIT 100",
        download_size
    }

-    pub async fn set_role_grants(
-        &self,
-        db_name: &PgIdent,
-        schema_name: &PgIdent,
-        privileges: &[Privilege],
-        role_name: &PgIdent,
-    ) -> Result<()> {
-        use tokio_postgres::config::Config;
-        use tokio_postgres::NoTls;
-
-        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
-        conf.dbname(db_name);
-
-        let (db_client, conn) = conf
-            .connect(NoTls)
-            .await
-            .context("Failed to connect to the database")?;
-        tokio::spawn(conn);
-
-        // TODO: support other types of grants apart from schemas?
-        let query = format!(
-            "GRANT {} ON SCHEMA {} TO {}",
-            privileges
-                .iter()
-                // should not be quoted as it's part of the command.
-                // is already sanitized so it's ok
-                .map(|p| p.as_str())
-                .collect::<Vec<&'static str>>()
-                .join(", "),
-            // quote the schema and role name as identifiers to sanitize them.
-            schema_name.pg_quote(),
-            role_name.pg_quote(),
-        );
-        db_client
-            .simple_query(&query)
-            .await
-            .with_context(|| format!("Failed to execute query: {}", query))?;
-
-        Ok(())
-    }
-
-    pub async fn install_extension(
-        &self,
-        ext_name: &PgIdent,
-        db_name: &PgIdent,
-        ext_version: ExtVersion,
-    ) -> Result<ExtVersion> {
-        use tokio_postgres::config::Config;
-        use tokio_postgres::NoTls;
-
-        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
-        conf.dbname(db_name);
-
-        let (db_client, conn) = conf
-            .connect(NoTls)
-            .await
-            .context("Failed to connect to the database")?;
-        tokio::spawn(conn);
-
-        let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1";
-        let version: Option<ExtVersion> = db_client
-            .query_opt(version_query, &[&ext_name])
-            .await
-            .with_context(|| format!("Failed to execute query: {}", version_query))?
-            .map(|row| row.get(0));
-
-        // sanitize the inputs as postgres idents.
-        let ext_name: String = ext_name.pg_quote();
-        let quoted_version: String = ext_version.pg_quote();
-
-        if let Some(installed_version) = version {
-            if installed_version == ext_version {
-                return Ok(installed_version);
-            }
-            let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}");
-            db_client
-                .simple_query(&query)
-                .await
-                .with_context(|| format!("Failed to execute query: {}", query))?;
-        } else {
-            let query =
-                format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
-            db_client
-                .simple_query(&query)
-                .await
-                .with_context(|| format!("Failed to execute query: {}", query))?;
-        }
-
-        Ok(ext_version)
-    }
-
    #[tokio::main]
    pub async fn prepare_preload_libraries(
        &self,
@@ -1583,6 +1484,28 @@ LIMIT 100",
            info!("Pageserver config changed");
        }
    }
+
+    // Gather info about installed extensions
+    pub fn get_installed_extensions(&self) -> Result<()> {
+        let connstr = self.connstr.clone();
+
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("failed to create runtime");
+        let result = rt
+            .block_on(crate::installed_extensions::get_installed_extensions(
+                connstr,
+            ))
+            .expect("failed to get installed extensions");
+
+        info!(
+            "{}",
+            serde_json::to_string(&result).expect("failed to serialize extensions list")
+        );
+
+        Ok(())
+    }
 }

 pub fn forward_termination_signal() {
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -107,7 +107,7 @@ pub fn get_pg_version(pgbin: &str) -> String {
    // pg_config --version returns a (platform specific) human readable string
    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
    let human_version = get_pg_config("--version", pgbin);
-    parse_pg_version(&human_version).to_string()
+    return parse_pg_version(&human_version).to_string();
 }

 fn parse_pg_version(human_version: &str) -> &str {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,11 +9,8 @@ use crate::catalog::SchemaDumpError;
 use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
-use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
-use compute_api::responses::{
-    ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
-    SetRoleGrantsResponse,
-};
+use compute_api::requests::ConfigurationRequest;
+use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};

 use anyhow::Result;
 use hyper::header::CONTENT_TYPE;
@@ -101,38 +98,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        (&Method::POST, "/extensions") => {
-            info!("serving /extensions POST request");
-            let status = compute.get_status();
-            if status != ComputeStatus::Running {
-                let msg = format!(
-                    "invalid compute status for extensions request: {:?}",
-                    status
-                );
-                error!(msg);
-                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
-            }
-
-            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
-            let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
-            let res = compute
-                .install_extension(&request.extension, &request.database, request.version)
-                .await;
-            match res {
-                Ok(version) => render_json(Body::from(
-                    serde_json::to_string(&ExtensionInstallResult {
-                        extension: request.extension,
-                        version,
-                    })
-                    .unwrap(),
-                )),
-                Err(e) => {
-                    error!("install_extension failed: {}", e);
-                    render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-
        (&Method::GET, "/info") => {
            let num_cpus = num_cpus::get_physical();
            info!("serving /info GET request. num_cpus: {}", num_cpus);
@@ -200,48 +165,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        (&Method::POST, "/grants") => {
-            info!("serving /grants POST request");
-            let status = compute.get_status();
-            if status != ComputeStatus::Running {
-                let msg = format!(
-                    "invalid compute status for set_role_grants request: {:?}",
-                    status
-                );
-                error!(msg);
-                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
-            }
-
-            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
-            let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
-
-            let res = compute
-                .set_role_grants(
-                    &request.database,
-                    &request.schema,
-                    &request.privileges,
-                    &request.role,
-                )
-                .await;
-            match res {
-                Ok(()) => render_json(Body::from(
-                    serde_json::to_string(&SetRoleGrantsResponse {
-                        database: request.database,
-                        schema: request.schema,
-                        role: request.role,
-                        privileges: request.privileges,
-                    })
-                    .unwrap(),
-                )),
-                Err(e) => render_json_error(
-                    &format!("could not grant role privileges to the schema: {e}"),
-                    // TODO: can we filter on role/schema not found errors
-                    // and return appropriate error code?
-                    StatusCode::INTERNAL_SERVER_ERROR,
-                ),
-            }
-        }
-
        // get the list of installed extensions
        // currently only used in python tests
        // TODO: call it from cplane
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -127,41 +127,6 @@ paths:
              schema:
                $ref: "#/components/schemas/GenericError"

-  /grants:
-    post:
-      tags:
-        - Grants
-      summary: Apply grants to the database.
-      description: ""
-      operationId: setRoleGrants
-      requestBody:
-        description: Grants request.
-        required: true
-        content:
-          application/json:
-            schema:
-                $ref: "#/components/schemas/SetRoleGrantsRequest"
-      responses:
-        200:
-          description: Grants applied.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/SetRoleGrantsResponse"
-        412:
-          description: |
-            Compute is not in the right state for processing the request.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-        500:
-          description: Error occurred during grants application.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-
  /check_writability:
    post:
      tags:
@@ -179,41 +144,6 @@ paths:
                description: Error text or 'true' if check passed.
                example: "true"

-  /extensions:
-    post:
-      tags:
-        - Extensions
-      summary: Install extension if possible.
-      description: ""
-      operationId: installExtension
-      requestBody:
-        description: Extension name and database to install it to.
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/ExtensionInstallRequest"
-      responses:
-        200:
-          description: Result from extension installation
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ExtensionInstallResult"
-        412:
-          description: |
-            Compute is in the wrong state for processing the request.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-        500:
-          description: Error during extension installation.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-
  /configure:
    post:
      tags:
@@ -439,7 +369,7 @@ components:
            moment, when spec was received.
          example: "2022-10-12T07:20:50.52Z"
        status:
-          $ref: "#/components/schemas/ComputeStatus"
+          $ref: '#/components/schemas/ComputeStatus'
        last_active:
          type: string
          description: |
@@ -479,38 +409,6 @@ components:
        - configuration
      example: running

-    ExtensionInstallRequest:
-      type: object
-      required:
-        - extension
-        - database
-        - version
-      properties:
-        extension:
-          type: string
-          description: Extension name.
-          example: "pg_session_jwt"
-        version:
-          type: string
-          description: Version of the extension.
-          example: "1.0.0"
-        database:
-          type: string
-          description: Database name.
-          example: "neondb"
-
-    ExtensionInstallResult:
-      type: object
-      properties:
-        extension:
-          description: Name of the extension.
-          type: string
-          example: "pg_session_jwt"
-        version:
-          description: Version of the extension.
-          type: string
-          example: "1.0.0"
-
    InstalledExtensions:
      type: object
      properties:
@@ -529,60 +427,6 @@ components:
              n_databases:
                type: integer

-    SetRoleGrantsRequest:
-      type: object
-      required:
-        - database
-        - schema
-        - privileges
-        - role
-      properties:
-        database:
-          type: string
-          description: Database name.
-          example: "neondb"
-        schema:
-          type: string
-          description: Schema name.
-          example: "public"
-        privileges:
-          type: array
-          items:
-            type: string
-          description: List of privileges to set.
-          example: ["SELECT", "INSERT"]
-        role:
-          type: string
-          description: Role name.
-          example: "neon"
-
-    SetRoleGrantsResponse:
-      type: object
-      required:
-        - database
-        - schema
-        - privileges
-        - role
-      properties:
-        database:
-          type: string
-          description: Database name.
-          example: "neondb"
-        schema:
-          type: string
-          description: Schema name.
-          example: "public"
-        privileges:
-          type: array
-          items:
-            type: string
-          description: List of privileges set.
-          example: ["SELECT", "INSERT"]
-        role:
-          type: string
-          description: Role name.
-          example: "neon"
-
    #
    # Errors
    #
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,7 +1,6 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use std::collections::HashMap;
 use std::collections::HashSet;
-use tracing::info;
 use url::Url;

 use anyhow::Result;
@@ -80,23 +79,3 @@ pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtension
    })
    .await?
 }
-
-// Gather info about installed extensions
-pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .expect("failed to create runtime");
-    let result = rt
-        .block_on(crate::installed_extensions::get_installed_extensions(
-            connstr,
-        ))
-        .expect("failed to get installed extensions");
-
-    info!(
-        "[NEON_EXT_STAT] {}",
-        serde_json::to_string(&result).expect("failed to serialize extensions list")
-    );
-
-    Ok(())
-}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1073,10 +1073,10 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
                    tenant_id,
                    TimelineCreateRequest {
                        new_timeline_id,
-                        mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
-                            existing_initdb_timeline_id: None,
-                            pg_version: Some(args.pg_version),
-                        },
+                        ancestor_timeline_id: None,
+                        ancestor_start_lsn: None,
+                        existing_initdb_timeline_id: None,
+                        pg_version: Some(args.pg_version),
                    },
                )
                .await?;
@@ -1133,10 +1133,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap {
-                    existing_initdb_timeline_id: None,
-                    pg_version: Some(args.pg_version),
-                },
+                ancestor_timeline_id: None,
+                existing_initdb_timeline_id: None,
+                ancestor_start_lsn: None,
+                pg_version: Some(args.pg_version),
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
@@ -1189,11 +1189,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
-                mode: pageserver_api::models::TimelineCreateRequestMode::Branch {
-                    ancestor_timeline_id,
-                    ancestor_start_lsn: start_lsn,
-                    pg_version: None,
-                },
+                ancestor_timeline_id: Some(ancestor_timeline_id),
+                existing_initdb_timeline_id: None,
+                ancestor_start_lsn: start_lsn,
+                pg_version: None,
            };
            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -529,6 +529,28 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }

+    pub async fn timeline_create(
+        &self,
+        tenant_shard_id: TenantShardId,
+        new_timeline_id: TimelineId,
+        ancestor_start_lsn: Option<Lsn>,
+        ancestor_timeline_id: Option<TimelineId>,
+        pg_version: Option<u32>,
+        existing_initdb_timeline_id: Option<TimelineId>,
+    ) -> anyhow::Result<TimelineInfo> {
+        let req = models::TimelineCreateRequest {
+            new_timeline_id,
+            ancestor_start_lsn,
+            ancestor_timeline_id,
+            pg_version,
+            existing_initdb_timeline_id,
+        };
+        Ok(self
+            .http_client
+            .timeline_create(tenant_shard_id, &req)
+            .await?)
+    }
+
    /// Import a basebackup prepared using either:
    /// a) `pg_basebackup -F tar`, or
    /// b) The `fullbackup` pageserver endpoint
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -20,16 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{
-    ffi::OsStr,
-    fs,
-    net::SocketAddr,
-    path::PathBuf,
-    process::ExitStatus,
-    str::FromStr,
-    sync::OnceLock,
-    time::{Duration, Instant},
-};
+use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -177,6 +168,16 @@ impl StorageController {
        .expect("non-Unicode path")
    }

+    /// PIDFile for the postgres instance used to store storage controller state
+    fn postgres_pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(
+            self.env
+                .base_data_dir
+                .join("storage_controller_postgres.pid"),
+        )
+        .expect("non-Unicode path")
+    }
+
    /// Find the directory containing postgres subdirectories, such `bin` and `lib`
    ///
    /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
@@ -295,31 +296,6 @@ impl StorageController {
            .map_err(anyhow::Error::new)
    }

-    /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres
-    async fn pg_ctl<I, S>(&self, args: I) -> ExitStatus
-    where
-        I: IntoIterator<Item = S>,
-        S: AsRef<OsStr>,
-    {
-        let pg_bin_dir = self.get_pg_bin_dir().await.unwrap();
-        let bin_path = pg_bin_dir.join("pg_ctl");
-
-        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
-        let envs = [
-            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-        ];
-
-        Command::new(bin_path)
-            .args(args)
-            .envs(envs)
-            .spawn()
-            .expect("Failed to spawn pg_ctl, binary_missing?")
-            .wait()
-            .await
-            .expect("Failed to wait for pg_ctl termination")
-    }
-
    pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
        let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
        if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
@@ -428,34 +404,20 @@ impl StorageController {
                db_start_args
            );

-            let db_start_status = self.pg_ctl(db_start_args).await;
-            let start_timeout: Duration = start_args.start_timeout.into();
-            let db_start_deadline = Instant::now() + start_timeout;
-            if !db_start_status.success() {
-                return Err(anyhow::anyhow!(
-                    "Failed to start postgres {}",
-                    db_start_status.code().unwrap()
-                ));
-            }
-
-            loop {
-                if Instant::now() > db_start_deadline {
-                    return Err(anyhow::anyhow!("Timed out waiting for postgres to start"));
-                }
-
-                match self.pg_isready(&pg_bin_dir, postgres_port).await {
-                    Ok(true) => {
-                        tracing::info!("storage controller postgres is now ready");
-                        break;
-                    }
-                    Ok(false) => {
-                        tokio::time::sleep(Duration::from_millis(100)).await;
-                    }
-                    Err(e) => {
-                        tracing::warn!("Failed to check postgres status: {e}")
-                    }
-                }
-            }
+            background_process::start_process(
+                "storage_controller_db",
+                &self.env.base_data_dir,
+                pg_bin_dir.join("pg_ctl").as_std_path(),
+                db_start_args,
+                vec![
+                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                ],
+                background_process::InitialPidFile::Create(self.postgres_pid_file()),
+                &start_args.start_timeout,
+                || self.pg_isready(&pg_bin_dir, postgres_port),
+            )
+            .await?;

            self.setup_database(postgres_port).await?;
        }
@@ -621,10 +583,15 @@ impl StorageController {
        }

        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;

        println!("Stopping storage controller database...");
        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
-        let stop_status = self.pg_ctl(pg_stop_args).await;
+        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_stop_args)
+            .spawn()?
+            .wait()
+            .await?;
        if !stop_status.success() {
            match self.is_postgres_running().await {
                Ok(false) => {
@@ -645,9 +612,14 @@ impl StorageController {

    async fn is_postgres_running(&self) -> anyhow::Result<bool> {
        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;

        let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-        let status_exitcode = self.pg_ctl(pg_status_args).await;
+        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_status_args)
+            .spawn()?
+            .wait()
+            .await?;

        // pg_ctl status returns this exit code if postgres is not running: in this case it is
        // fine that stop failed.  Otherwise it is an error that stop failed.
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -111,11 +111,6 @@ enum Command {
        #[arg(long)]
        node: NodeId,
    },
-    /// Cancel any ongoing reconciliation for this shard
-    TenantShardCancelReconcile {
-        #[arg(long)]
-        tenant_shard_id: TenantShardId,
-    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
@@ -540,15 +535,6 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::TenantShardCancelReconcile { tenant_shard_id } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
-                    None,
-                )
-                .await?;
-        }
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

--- a/docs/docker.md
+++ b/docs/docker.md
@@ -5,7 +5,7 @@
 Currently we build two main images:

 - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).
+- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node).

 And additional intermediate image:

@@ -56,7 +56,7 @@ CREATE TABLE
 postgres=# insert into t values(1, 1);
 INSERT 0 1
 postgres=# select * from t;
- key | value
+ key | value 
 -----+-------
   1 | 1
 (1 row)
@@ -84,4 +84,4 @@ Access http://localhost:9001 and sign in.
 - Username: `minio`
 - Password: `password`

-You can see durable pages and WAL data in `neon` bucket.
+You can see durable pages and WAL data in `neon` bucket.
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,6 +1,5 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
-pub mod privilege;
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/compute_api/src/privilege.rs
+++ b/libs/compute_api/src/privilege.rs
@@ -1,35 +0,0 @@
-#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(rename_all = "UPPERCASE")]
-pub enum Privilege {
-    Select,
-    Insert,
-    Update,
-    Delete,
-    Truncate,
-    References,
-    Trigger,
-    Usage,
-    Create,
-    Connect,
-    Temporary,
-    Execute,
-}
-
-impl Privilege {
-    pub fn as_str(&self) -> &'static str {
-        match self {
-            Privilege::Select => "SELECT",
-            Privilege::Insert => "INSERT",
-            Privilege::Update => "UPDATE",
-            Privilege::Delete => "DELETE",
-            Privilege::Truncate => "TRUNCATE",
-            Privilege::References => "REFERENCES",
-            Privilege::Trigger => "TRIGGER",
-            Privilege::Usage => "USAGE",
-            Privilege::Create => "CREATE",
-            Privilege::Connect => "CONNECT",
-            Privilege::Temporary => "TEMPORARY",
-            Privilege::Execute => "EXECUTE",
-        }
-    }
-}
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -1,8 +1,6 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.
-use crate::{
-    privilege::Privilege,
-    spec::{ComputeSpec, ExtVersion, PgIdent},
-};
+
+use crate::spec::ComputeSpec;
 use serde::Deserialize;

 /// Request of the /configure API
@@ -14,18 +12,3 @@ use serde::Deserialize;
 pub struct ConfigurationRequest {
    pub spec: ComputeSpec,
 }
-
-#[derive(Deserialize, Debug)]
-pub struct ExtensionInstallRequest {
-    pub extension: PgIdent,
-    pub database: PgIdent,
-    pub version: ExtVersion,
-}
-
-#[derive(Deserialize, Debug)]
-pub struct SetRoleGrantsRequest {
-    pub database: PgIdent,
-    pub schema: PgIdent,
-    pub privileges: Vec<Privilege>,
-    pub role: PgIdent,
-}
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -6,10 +6,7 @@ use std::fmt::Display;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};

-use crate::{
-    privilege::Privilege,
-    spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role},
-};
+use crate::spec::{ComputeSpec, Database, Role};

 #[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
@@ -171,16 +168,3 @@ pub struct InstalledExtension {
 pub struct InstalledExtensions {
    pub extensions: Vec<InstalledExtension>,
 }
-
-#[derive(Clone, Debug, Default, Serialize)]
-pub struct ExtensionInstallResult {
-    pub extension: PgIdent,
-    pub version: ExtVersion,
-}
-#[derive(Clone, Debug, Default, Serialize)]
-pub struct SetRoleGrantsResponse {
-    pub database: PgIdent,
-    pub schema: PgIdent,
-    pub privileges: Vec<Privilege>,
-    pub role: PgIdent,
-}
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -16,9 +16,6 @@ use remote_storage::RemotePath;
 /// intended to be used for DB / role names.
 pub type PgIdent = String;

-/// String type alias representing Postgres extension version
-pub type ExtVersion = String;
-
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -19,7 +19,6 @@ use once_cell::sync::Lazy;
 use prometheus::core::{
    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
 };
-pub use prometheus::local::LocalHistogram;
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -102,7 +102,6 @@ pub struct ConfigToml {
    pub ingest_batch_size: u64,
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
    pub image_compression: ImageCompressionAlgorithm,
-    pub timeline_offloading: bool,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
@@ -386,7 +385,6 @@ impl Default for ConfigToml {
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
-            timeline_offloading: false,
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
            virtual_file_io_mode: None,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -211,30 +211,13 @@ pub enum TimelineState {
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateRequest {
    pub new_timeline_id: TimelineId,
-    #[serde(flatten)]
-    pub mode: TimelineCreateRequestMode,
-}
-
-#[derive(Serialize, Deserialize, Clone)]
-#[serde(untagged)]
-pub enum TimelineCreateRequestMode {
-    Branch {
-        ancestor_timeline_id: TimelineId,
-        #[serde(default)]
-        ancestor_start_lsn: Option<Lsn>,
-        // TODO: cplane sets this, but, the branching code always
-        // inherits the ancestor's pg_version. Earlier code wasn't
-        // using a flattened enum, so, it was an accepted field, and
-        // we continue to accept it by having it here.
-        pg_version: Option<u32>,
-    },
-    // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap.
-    // (serde picks the first matching enum variant, in declaration order).
-    Bootstrap {
-        #[serde(default)]
-        existing_initdb_timeline_id: Option<TimelineId>,
-        pg_version: Option<u32>,
-    },
+    #[serde(default)]
+    pub ancestor_timeline_id: Option<TimelineId>,
+    #[serde(default)]
+    pub existing_initdb_timeline_id: Option<TimelineId>,
+    #[serde(default)]
+    pub ancestor_start_lsn: Option<Lsn>,
+    pub pg_version: Option<u32>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -701,25 +684,6 @@ pub struct TimelineArchivalConfigRequest {
    pub state: TimelineArchivalState,
 }

-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct TimelinesInfoAndOffloaded {
-    pub timelines: Vec<TimelineInfo>,
-    pub offloaded: Vec<OffloadedTimelineInfo>,
-}
-
-/// Analog of [`TimelineInfo`] for offloaded timelines.
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct OffloadedTimelineInfo {
-    pub tenant_id: TenantShardId,
-    pub timeline_id: TimelineId,
-    /// Whether the timeline has a parent it has been branched off from or not
-    pub ancestor_timeline_id: Option<TimelineId>,
-    /// Whether to retain the branch lsn at the ancestor or not
-    pub ancestor_retain_lsn: Option<Lsn>,
-    /// The time point when the timeline was archived
-    pub archived_at: chrono::DateTime<chrono::Utc>,
-}
-
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
@@ -779,6 +743,8 @@ pub struct TimelineInfo {
    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
    // read.
+    /// The last aux file policy being used on this timeline
+    pub last_aux_file_policy: Option<AuxFilePolicy>,
    pub is_archived: Option<bool>,
 }

@@ -1068,12 +1034,6 @@ pub mod virtual_file {
    }
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ScanDisposableKeysResponse {
-    pub disposable_count: usize,
-    pub not_disposable_count: usize,
-}
-
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -16,7 +16,7 @@ impl serde::Serialize for Partitioning {
    {
        pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);

-        impl serde::Serialize for KeySpace<'_> {
+        impl<'a> serde::Serialize for KeySpace<'a> {
            fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
            where
                S: serde::Serializer,
@@ -44,7 +44,7 @@ impl serde::Serialize for Partitioning {

 pub struct WithDisplay<'a, T>(&'a T);

-impl<T: std::fmt::Display> serde::Serialize for WithDisplay<'_, T> {
+impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
@@ -55,7 +55,7 @@ impl<T: std::fmt::Display> serde::Serialize for WithDisplay<'_, T> {

 pub struct KeyRange<'a>(&'a std::ops::Range<crate::key::Key>);

-impl serde::Serialize for KeyRange<'_> {
+impl<'a> serde::Serialize for KeyRange<'a> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -738,20 +738,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        QueryError::SimulatedConnectionError => {
                            return Err(QueryError::SimulatedConnectionError)
                        }
-                        err @ QueryError::Reconnect => {
-                            // Instruct the client to reconnect, stop processing messages
-                            // from this libpq connection and, finally, disconnect from the
-                            // server side (returning an Err achieves the later).
-                            //
-                            // Note the flushing is done by the caller.
-                            let reconnect_error = short_error(&err);
-                            self.write_message_noflush(&BeMessage::ErrorResponse(
-                                &reconnect_error,
-                                Some(err.pg_error_code()),
-                            ))?;
-
-                            return Err(err);
-                        }
                        e => {
                            log_query_error(query_string, &e);
                            let short_error = short_error(&e);
@@ -935,11 +921,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
 /// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
 /// messages.
 ///
+
 pub struct CopyDataWriter<'a, IO> {
    pgb: &'a mut PostgresBackend<IO>,
 }

-impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
+impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> {
    fn poll_write(
        self: Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -2,7 +2,6 @@
 use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
-use rustls::crypto::aws_lc_rs;
 use std::io::Cursor;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -93,13 +92,10 @@ static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
 async fn simple_select_ssl() {
    let (client_sock, server_sock) = make_tcp_pair().await;

-    let server_cfg =
-        rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
-            .with_safe_default_protocol_versions()
-            .expect("aws_lc_rs should support the default protocol versions")
-            .with_no_client_auth()
-            .with_single_cert(vec![CERT.clone()], KEY.clone_key())
-            .unwrap();
+    let server_cfg = rustls::ServerConfig::builder()
+        .with_no_client_auth()
+        .with_single_cert(vec![CERT.clone()], KEY.clone_key())
+        .unwrap();
    let tls_config = Some(Arc::new(server_cfg));
    let pgbackend =
        PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation");
@@ -109,16 +105,13 @@ async fn simple_select_ssl() {
        pgbackend.run(&mut handler, &CancellationToken::new()).await
    });

-    let client_cfg =
-        rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
-            .with_safe_default_protocol_versions()
-            .expect("aws_lc_rs should support the default protocol versions")
-            .with_root_certificates({
-                let mut store = rustls::RootCertStore::empty();
-                store.add(CERT.clone()).unwrap();
-                store
-            })
-            .with_no_client_auth();
+    let client_cfg = rustls::ClientConfig::builder()
+        .with_root_certificates({
+            let mut store = rustls::RootCertStore::empty();
+            store.add(CERT.clone()).unwrap();
+            store
+        })
+        .with_no_client_auth();
    let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg);
    let tls_connect = <MakeRustlsConnect as MakeTlsConnect<TcpStream>>::make_tls_connect(
        &mut make_tls_connect,
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -727,7 +727,7 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
 pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
 pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";

-impl BeMessage<'_> {
+impl<'a> BeMessage<'a> {
    /// Serialize `message` to the given `buf`.
    /// Apart from smart memory managemet, BytesMut is good here as msg len
    /// precedes its body and it is handy to write it down first and then fill
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -19,12 +19,7 @@ mod simulate_failures;
 mod support;

 use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::NonZeroU32,
-    ops::Bound,
-    pin::{pin, Pin},
-    sync::Arc,
+    collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc,
    time::SystemTime,
 };

@@ -33,7 +28,6 @@ use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
 use futures::{stream::Stream, StreamExt};
-use itertools::Itertools as _;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -267,7 +261,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Listing, DownloadError> {
-        let mut stream = pin!(self.list_streaming(prefix, mode, max_keys, cancel));
+        let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
        let mut combined = stream.next().await.expect("At least one item required")?;
        while let Some(list) = stream.next().await {
            let list = list?;
@@ -330,35 +324,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
        cancel: &CancellationToken,
    ) -> anyhow::Result<()>;

-    /// Deletes all objects matching the given prefix.
-    ///
-    /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
-    /// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc.
-    ///
-    /// If the operation fails because of timeout or cancellation, the root cause of the error will
-    /// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
-    /// through.
-    async fn delete_prefix(
-        &self,
-        prefix: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> anyhow::Result<()> {
-        let mut stream =
-            pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel));
-        while let Some(result) = stream.next().await {
-            let keys = match result {
-                Ok(listing) if listing.keys.is_empty() => continue,
-                Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(),
-                Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()),
-                Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()),
-                Err(err) => return Err(err.into()),
-            };
-            tracing::info!("Deleting {} keys from remote storage", keys.len());
-            self.delete_objects(&keys, cancel).await?;
-        }
-        Ok(())
-    }
-
    /// Copy a remote object inside a bucket from one path to another.
    async fn copy(
        &self,
@@ -523,20 +488,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    /// See [`RemoteStorage::delete_prefix`]
-    pub async fn delete_prefix(
-        &self,
-        prefix: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await,
-            Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await,
-            Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await,
-            Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await,
-        }
-    }
-
    /// See [`RemoteStorage::copy`]
    pub async fn copy_object(
        &self,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -357,20 +357,22 @@ impl RemoteStorage for LocalFs {
                .list_recursive(prefix)
                .await
                .map_err(DownloadError::Other)?;
-            let mut objects = Vec::with_capacity(keys.len());
-            for key in keys {
-                let path = key.with_base(&self.storage_root);
-                let metadata = file_metadata(&path).await?;
-                if metadata.is_dir() {
-                    continue;
-                }
-                objects.push(ListingObject {
-                    key: key.clone(),
-                    last_modified: metadata.modified()?,
-                    size: metadata.len(),
-                });
-            }
-            let objects = objects;
+            let objects = keys
+                .into_iter()
+                .filter_map(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    if path.is_dir() {
+                        None
+                    } else {
+                        Some(ListingObject {
+                            key: k.clone(),
+                            // LocalFs is just for testing, so just specify a dummy time
+                            last_modified: SystemTime::now(),
+                            size: 0,
+                        })
+                    }
+                })
+                .collect();

            if let ListingMode::NoDelimiter = mode {
                result.keys = objects;
@@ -408,8 +410,9 @@ impl RemoteStorage for LocalFs {
                    } else {
                        result.keys.push(ListingObject {
                            key: RemotePath::from_string(&relative_key).unwrap(),
-                            last_modified: object.last_modified,
-                            size: object.size,
+                            // LocalFs is just for testing
+                            last_modified: SystemTime::now(),
+                            size: 0,
                        });
                    }
                }
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -199,138 +199,6 @@ async fn list_no_delimiter_works(
    Ok(())
 }

-/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"),
-/// but only with NoDelimiter.
-#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
-#[tokio::test]
-async fn list_partial_prefix(
-    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
-) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-
-    let cancel = CancellationToken::new();
-    let test_client = Arc::clone(&ctx.enabled.client);
-
-    // Prefix "fold" should match all "folder{i}" directories with NoDelimiter.
-    let objects: HashSet<_> = test_client
-        .list(
-            Some(&RemotePath::from_string("fold")?),
-            ListingMode::NoDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .keys
-        .into_iter()
-        .map(|o| o.key)
-        .collect();
-    assert_eq!(&objects, &ctx.remote_blobs);
-
-    // Prefix "fold" matches nothing with WithDelimiter.
-    let objects: HashSet<_> = test_client
-        .list(
-            Some(&RemotePath::from_string("fold")?),
-            ListingMode::WithDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .keys
-        .into_iter()
-        .map(|o| o.key)
-        .collect();
-    assert!(objects.is_empty());
-
-    // Prefix "" matches everything.
-    let objects: HashSet<_> = test_client
-        .list(
-            Some(&RemotePath::from_string("")?),
-            ListingMode::NoDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .keys
-        .into_iter()
-        .map(|o| o.key)
-        .collect();
-    assert_eq!(&objects, &ctx.remote_blobs);
-
-    // Prefix "" matches nothing with WithDelimiter.
-    let objects: HashSet<_> = test_client
-        .list(
-            Some(&RemotePath::from_string("")?),
-            ListingMode::WithDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .keys
-        .into_iter()
-        .map(|o| o.key)
-        .collect();
-    assert!(objects.is_empty());
-
-    // Prefix "foo" matches nothing.
-    let objects: HashSet<_> = test_client
-        .list(
-            Some(&RemotePath::from_string("foo")?),
-            ListingMode::NoDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .keys
-        .into_iter()
-        .map(|o| o.key)
-        .collect();
-    assert!(objects.is_empty());
-
-    // Prefix "folder2/blob" matches.
-    let objects: HashSet<_> = test_client
-        .list(
-            Some(&RemotePath::from_string("folder2/blob")?),
-            ListingMode::NoDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .keys
-        .into_iter()
-        .map(|o| o.key)
-        .collect();
-    let expect: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .filter(|o| o.get_path().starts_with("folder2"))
-        .cloned()
-        .collect();
-    assert_eq!(&objects, &expect);
-
-    // Prefix "folder2/foo" matches nothing.
-    let objects: HashSet<_> = test_client
-        .list(
-            Some(&RemotePath::from_string("folder2/foo")?),
-            ListingMode::NoDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .keys
-        .into_iter()
-        .map(|o| o.key)
-        .collect();
-    assert!(objects.is_empty());
-
-    Ok(())
-}
-
 #[test_context(MaybeEnabledStorage)]
 #[tokio::test]
 async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
@@ -397,80 +265,6 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
    Ok(())
 }

-/// Tests that delete_prefix() will delete all objects matching a prefix, including
-/// partial prefixes (i.e. "/foo" matches "/foobar").
-#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
-#[tokio::test]
-async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-
-    let cancel = CancellationToken::new();
-    let test_client = Arc::clone(&ctx.enabled.client);
-
-    /// Asserts that the S3 listing matches the given paths.
-    macro_rules! assert_list {
-        ($expect:expr) => {{
-            let listing = test_client
-                .list(None, ListingMode::NoDelimiter, None, &cancel)
-                .await?
-                .keys
-                .into_iter()
-                .map(|o| o.key)
-                .collect();
-            assert_eq!($expect, listing);
-        }};
-    }
-
-    // We start with the full set of uploaded files.
-    let mut expect = ctx.remote_blobs.clone();
-
-    // Deleting a non-existing prefix should do nothing.
-    test_client
-        .delete_prefix(&RemotePath::from_string("xyz")?, &cancel)
-        .await?;
-    assert_list!(expect);
-
-    // Prefixes are case-sensitive.
-    test_client
-        .delete_prefix(&RemotePath::from_string("Folder")?, &cancel)
-        .await?;
-    assert_list!(expect);
-
-    // Deleting a path which overlaps with an existing object should do nothing. We pick the first
-    // path in the set as our common prefix.
-    let path = expect.iter().next().expect("empty set").clone().join("xyz");
-    test_client.delete_prefix(&path, &cancel).await?;
-    assert_list!(expect);
-
-    // Deleting an exact path should work. We pick the first path in the set.
-    let path = expect.iter().next().expect("empty set").clone();
-    test_client.delete_prefix(&path, &cancel).await?;
-    expect.remove(&path);
-    assert_list!(expect);
-
-    // Deleting a prefix should delete all matching objects.
-    test_client
-        .delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel)
-        .await?;
-    expect.retain(|p| !p.get_path().as_str().starts_with("folder0/"));
-    assert_list!(expect);
-
-    // Deleting a common prefix should delete all objects.
-    test_client
-        .delete_prefix(&RemotePath::from_string("fold")?, &cancel)
-        .await?;
-    expect.clear();
-    assert_list!(expect);
-
-    Ok(())
-}
-
 #[test_context(MaybeEnabledStorage)]
 #[tokio::test]
 async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
--- a/libs/tenant_size_model/src/svg.rs
+++ b/libs/tenant_size_model/src/svg.rs
@@ -97,7 +97,7 @@ pub fn draw_svg(
    Ok(result)
 }

-impl SvgDraw<'_> {
+impl<'a> SvgDraw<'a> {
    fn calculate_svg_layout(&mut self) {
        // Find x scale
        let segments = &self.storage.segments;
--- a/libs/tracing-utils/src/http.rs
+++ b/libs/tracing-utils/src/http.rs
@@ -82,7 +82,7 @@ where
 fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context {
    struct HeaderExtractor<'a>(&'a HeaderMap);

-    impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> {
+    impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> {
        fn get(&self, key: &str) -> Option<&str> {
            self.0.get(key).and_then(|value| value.to_str().ok())
        }
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -37,7 +37,7 @@ impl<'de> Deserialize<'de> for Lsn {
            is_human_readable_deserializer: bool,
        }

-        impl Visitor<'_> for LsnVisitor {
+        impl<'de> Visitor<'de> for LsnVisitor {
            type Value = Lsn;

            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -73,7 +73,7 @@ impl<T> Poison<T> {
 /// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
 pub struct Guard<'a, T>(&'a mut Poison<T>);

-impl<T> Guard<'_, T> {
+impl<'a, T> Guard<'a, T> {
    pub fn data(&self) -> &T {
        &self.0.data
    }
@@ -94,7 +94,7 @@ impl<T> Guard<'_, T> {
    }
 }

-impl<T> Drop for Guard<'_, T> {
+impl<'a, T> Drop for Guard<'a, T> {
    fn drop(&mut self) {
        match self.0.state {
            State::Clean => {
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -164,7 +164,7 @@ impl TenantShardId {
    }
 }

-impl std::fmt::Display for ShardSlug<'_> {
+impl<'a> std::fmt::Display for ShardSlug<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -152,7 +152,7 @@ pub struct RcuWriteGuard<'a, V> {
    inner: RwLockWriteGuard<'a, RcuInner<V>>,
 }

-impl<V> Deref for RcuWriteGuard<'_, V> {
+impl<'a, V> Deref for RcuWriteGuard<'a, V> {
    type Target = V;

    fn deref(&self) -> &V {
@@ -160,7 +160,7 @@ impl<V> Deref for RcuWriteGuard<'_, V> {
    }
 }

-impl<V> RcuWriteGuard<'_, V> {
+impl<'a, V> RcuWriteGuard<'a, V> {
    ///
    /// Store a new value. The new value will be written to the Rcu immediately,
    /// and will be immediately seen by any `read` calls that start afterwards.
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -219,7 +219,7 @@ impl<'a, T> CountWaitingInitializers<'a, T> {
    }
 }

-impl<T> Drop for CountWaitingInitializers<'_, T> {
+impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
    fn drop(&mut self) {
        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
    }
@@ -250,7 +250,7 @@ impl<T> std::ops::DerefMut for Guard<'_, T> {
    }
 }

-impl<T> Guard<'_, T> {
+impl<'a, T> Guard<'a, T> {
    /// Take the current value, and a new permit for it's deinitialization.
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -184,23 +184,23 @@ mod tests {

    struct MemoryIdentity<'a>(&'a dyn Extractor);

-    impl MemoryIdentity<'_> {
+    impl<'a> MemoryIdentity<'a> {
        fn as_ptr(&self) -> *const () {
            self.0 as *const _ as *const ()
        }
    }
-    impl PartialEq for MemoryIdentity<'_> {
+    impl<'a> PartialEq for MemoryIdentity<'a> {
        fn eq(&self, other: &Self) -> bool {
            self.as_ptr() == other.as_ptr()
        }
    }
-    impl Eq for MemoryIdentity<'_> {}
-    impl Hash for MemoryIdentity<'_> {
+    impl<'a> Eq for MemoryIdentity<'a> {}
+    impl<'a> Hash for MemoryIdentity<'a> {
        fn hash<H: Hasher>(&self, state: &mut H) {
            self.as_ptr().hash(state);
        }
    }
-    impl fmt::Debug for MemoryIdentity<'_> {
+    impl<'a> fmt::Debug for MemoryIdentity<'a> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "{:p}: {}", self.as_ptr(), self.0.id())
        }
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -164,11 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    let conf: &'static PageServerConf = Box::leak(Box::new(
        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
    ));
-    virtual_file::init(
-        16384,
-        virtual_file::io_engine_for_bench(),
-        conf.virtual_file_io_mode,
-    );
+    virtual_file::init(16384, virtual_file::io_engine_for_bench());
    page_cache::init(conf.page_cache_size);

    {
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -133,7 +133,7 @@ enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
    Unloaded(&'a E::DeltaLayer),
 }
-impl<E: CompactionJobExecutor> LazyLoadLayer<'_, E> {
+impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
    fn min_key(&self) -> E::Key {
        match self {
            Self::Loaded(entries) => entries.front().unwrap().key(),
@@ -147,23 +147,23 @@ impl<E: CompactionJobExecutor> LazyLoadLayer<'_, E> {
        }
    }
 }
-impl<E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'_, E> {
+impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
 }
-impl<E: CompactionJobExecutor> Ord for LazyLoadLayer<'_, E> {
+impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // reverse order so that we get a min-heap
        (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
    }
 }
-impl<E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'_, E> {
+impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == std::cmp::Ordering::Equal
    }
 }
-impl<E: CompactionJobExecutor> Eq for LazyLoadLayer<'_, E> {}
+impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}

 type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;

--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -11,7 +11,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
    match cmd {
        IndexPartCmd::Dump { path } => {
            let bytes = tokio::fs::read(path).await.context("read file")?;
-            let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
+            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
            let output = serde_json::to_string_pretty(&des).context("serialize output")?;
            println!("{output}");
            Ok(())
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -7,7 +7,6 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
-use pageserver::virtual_file::api::IoMode;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
@@ -153,11 +152,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        IoMode::preferred(),
-    );
+    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -11,7 +11,6 @@ use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
 use pageserver::tenant::storage_layer::{delta_layer, image_layer};
 use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
-use pageserver::virtual_file::api::IoMode;
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
    repository::{Key, KEY_SIZE},
@@ -60,11 +59,7 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        IoMode::preferred(),
-    );
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
@@ -195,11 +190,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(
-                10,
-                virtual_file::api::IoEngineKind::StdFs,
-                IoMode::preferred(),
-            );
+            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -24,7 +24,7 @@ use pageserver::{
    page_cache,
    task_mgr::TaskKind,
    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
-    virtual_file::{self, api::IoMode},
+    virtual_file,
 };
 use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
@@ -205,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        IoMode::preferred(),
-    );
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -167,11 +167,7 @@ fn main() -> anyhow::Result<()> {
    let scenario = failpoint_support::init();

    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        conf.max_file_descriptors,
-        conf.virtual_file_io_engine,
-        conf.virtual_file_io_mode,
-    );
+    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -164,9 +164,6 @@ pub struct PageServerConf {

    pub image_compression: ImageCompressionAlgorithm,

-    /// Whether to offload archived timelines automatically
-    pub timeline_offloading: bool,
-
    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
    /// of ephemeral data.
@@ -324,7 +321,6 @@ impl PageServerConf {
            ingest_batch_size,
            max_vectored_read_bytes,
            image_compression,
-            timeline_offloading,
            ephemeral_bytes_per_memory_kb,
            l0_flush,
            virtual_file_io_mode,
@@ -368,7 +364,6 @@ impl PageServerConf {
            ingest_batch_size,
            max_vectored_read_bytes,
            image_compression,
-            timeline_offloading,
            ephemeral_bytes_per_memory_kb,

            // ------------------------------------------------------------
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -198,7 +198,7 @@ fn serialize_in_chunks<'a>(
        }
    }

-    impl ExactSizeIterator for Iter<'_> {}
+    impl<'a> ExactSizeIterator for Iter<'a> {}

    let buffer = bytes::BytesMut::new();
    let inner = input.chunks(chunk_size);
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -654,7 +654,7 @@ impl std::fmt::Debug for EvictionCandidate {
        let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
        let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
        struct DisplayIsDebug<'a, T>(&'a T);
-        impl<T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'_, T> {
+        impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                write!(f, "{}", self.0)
            }
@@ -1218,7 +1218,16 @@ mod filesystem_level_usage {
        let stat = Statvfs::get(tenants_dir, mock_config)
            .context("statvfs failed, presumably directory got unlinked")?;

-        let (avail_bytes, total_bytes) = stat.get_avail_total_bytes();
+        // https://unix.stackexchange.com/a/703650
+        let blocksize = if stat.fragment_size() > 0 {
+            stat.fragment_size()
+        } else {
+            stat.block_size()
+        };
+
+        // use blocks_available (f_bavail), since the pageserver runs as an unprivileged user
+        let avail_bytes = stat.blocks_available() * blocksize;
+        let total_bytes = stat.blocks() * blocksize;

        Ok(Usage {
            config,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -597,10 +597,6 @@ paths:
        Create a timeline. Returns new timeline id on success.
        Recreating the same timeline will succeed if the parameters match the existing timeline.
        If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
-
-        To ensure durability, the caller must retry the creation until success.
-        Just because the timeline is visible via other endpoints does not mean it is durable.
-        Future versions may stop showing timelines that are not yet durable.
      requestBody:
        content:
          application/json:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -18,6 +18,7 @@ use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::virtual_file::IoMode;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
@@ -26,7 +27,6 @@ use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::LocationConfigMode;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
-use pageserver_api::models::OffloadedTimelineInfo;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigRequest;
@@ -38,8 +38,6 @@ use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TimelineArchivalConfigRequest;
-use pageserver_api::models::TimelineCreateRequestMode;
-use pageserver_api::models::TimelinesInfoAndOffloaded;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
 use pageserver_api::models::TopTenantShardsResponse;
@@ -84,9 +82,7 @@ use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
-use crate::tenant::OffloadedTimeline;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
-use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -478,28 +474,12 @@ async fn build_timeline_info_common(
        is_archived: Some(is_archived),

        walreceiver_status,
+
+        last_aux_file_policy: timeline.last_aux_file_policy.load(),
    };
    Ok(info)
 }

-fn build_timeline_offloaded_info(offloaded: &Arc<OffloadedTimeline>) -> OffloadedTimelineInfo {
-    let &OffloadedTimeline {
-        tenant_shard_id,
-        timeline_id,
-        ancestor_retain_lsn,
-        ancestor_timeline_id,
-        archived_at,
-        ..
-    } = offloaded.as_ref();
-    OffloadedTimelineInfo {
-        tenant_id: tenant_shard_id,
-        timeline_id,
-        ancestor_retain_lsn,
-        ancestor_timeline_id,
-        archived_at: archived_at.and_utc(),
-    }
-}
-
 // healthcheck handler
 async fn status_handler(
    request: Request<Body>,
@@ -549,26 +529,6 @@ async fn timeline_create_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let new_timeline_id = request_data.new_timeline_id;
-    // fill in the default pg_version if not provided & convert request into domain model
-    let params: tenant::CreateTimelineParams = match request_data.mode {
-        TimelineCreateRequestMode::Bootstrap {
-            existing_initdb_timeline_id,
-            pg_version,
-        } => tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap {
-            new_timeline_id,
-            existing_initdb_timeline_id,
-            pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION),
-        }),
-        TimelineCreateRequestMode::Branch {
-            ancestor_timeline_id,
-            ancestor_start_lsn,
-            pg_version: _,
-        } => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch {
-            new_timeline_id,
-            ancestor_timeline_id,
-            ancestor_start_lsn,
-        }),
-    };

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

@@ -581,12 +541,22 @@ async fn timeline_create_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        // earlier versions of the code had pg_version and ancestor_lsn in the span
-        // => continue to provide that information, but, through a log message that doesn't require us to destructure
-        tracing::info!(?params, "creating timeline");
+        if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
+            tracing::info!(%ancestor_id, "starting to branch");
+        } else {
+            tracing::info!("bootstrapping");
+        }

        match tenant
-            .create_timeline(params, state.broker_client.clone(), &ctx)
+            .create_timeline(
+                new_timeline_id,
+                request_data.ancestor_timeline_id,
+                request_data.ancestor_start_lsn,
+                request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+                request_data.existing_initdb_timeline_id,
+                state.broker_client.clone(),
+                &ctx,
+            )
            .await
        {
            Ok(new_timeline) => {
@@ -637,6 +607,8 @@ async fn timeline_create_handler(
        tenant_id = %tenant_shard_id.tenant_id,
        shard_id = %tenant_shard_id.shard_slug(),
        timeline_id = %new_timeline_id,
+        lsn=?request_data.ancestor_start_lsn,
+        pg_version=?request_data.pg_version
    ))
    .await
 }
@@ -674,7 +646,7 @@ async fn timeline_list_handler(
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
            .await
-            .context("Failed to build timeline info")
+            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
            .map_err(ApiError::InternalServerError)?;

            response_data.push(timeline_info);
@@ -689,62 +661,6 @@ async fn timeline_list_handler(
    json_response(StatusCode::OK, response_data)
 }

-async fn timeline_and_offloaded_list_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let include_non_incremental_logical_size: Option<bool> =
-        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    let force_await_initial_logical_size: Option<bool> =
-        parse_query_param(&request, "force-await-initial-logical-size")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let state = get_state(&request);
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-
-    let response_data = async {
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
-
-        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-        let (timelines, offloadeds) = tenant.list_timelines_and_offloaded();
-
-        let mut timeline_infos = Vec::with_capacity(timelines.len());
-        for timeline in timelines {
-            let timeline_info = build_timeline_info(
-                &timeline,
-                include_non_incremental_logical_size.unwrap_or(false),
-                force_await_initial_logical_size.unwrap_or(false),
-                &ctx,
-            )
-            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
-            .await
-            .context("Failed to build timeline info")
-            .map_err(ApiError::InternalServerError)?;
-
-            timeline_infos.push(timeline_info);
-        }
-        let offloaded_infos = offloadeds
-            .into_iter()
-            .map(|offloaded| build_timeline_offloaded_info(&offloaded))
-            .collect::<Vec<_>>();
-        let res = TimelinesInfoAndOffloaded {
-            timelines: timeline_infos,
-            offloaded: offloaded_infos,
-        };
-        Ok::<TimelinesInfoAndOffloaded, ApiError>(res)
-    }
-    .instrument(info_span!("timeline_and_offloaded_list",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug()))
-    .await?;
-
-    json_response(StatusCode::OK, response_data)
-}
-
 async fn timeline_preserve_initdb_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1293,99 +1209,6 @@ async fn layer_map_info_handler(
    json_response(StatusCode::OK, layer_map_info)
 }

-#[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))]
-async fn timeline_layer_scan_disposable_keys(
-    request: Request<Body>,
-    cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let layer_name: LayerName = parse_request_param(&request, "layer_name")?;
-
-    tracing::Span::current().record(
-        "tenant_id",
-        tracing::field::display(&tenant_shard_id.tenant_id),
-    );
-    tracing::Span::current().record(
-        "shard_id",
-        tracing::field::display(tenant_shard_id.shard_slug()),
-    );
-    tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id));
-    tracing::Span::current().record("layer_name", tracing::field::display(&layer_name));
-
-    let state = get_state(&request);
-
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    // technically the timeline need not be active for this scan to complete
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-
-    let guard = timeline.layers.read().await;
-    let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
-        ));
-    };
-
-    let resident_layer = layer
-        .download_and_keep_resident()
-        .await
-        .map_err(|err| match err {
-            tenant::storage_layer::layer::DownloadError::TimelineShutdown
-            | tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
-                ApiError::ShuttingDown
-            }
-            tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
-            | tenant::storage_layer::layer::DownloadError::DownloadRequired
-            | tenant::storage_layer::layer::DownloadError::NotFile(_)
-            | tenant::storage_layer::layer::DownloadError::DownloadFailed
-            | tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => {
-                ApiError::InternalServerError(err.into())
-            }
-            #[cfg(test)]
-            tenant::storage_layer::layer::DownloadError::Failpoint(_) => {
-                ApiError::InternalServerError(err.into())
-            }
-        })?;
-
-    let keys = resident_layer
-        .load_keys(&ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    let shard_identity = timeline.get_shard_identity();
-
-    let mut disposable_count = 0;
-    let mut not_disposable_count = 0;
-    let cancel = cancel.clone();
-    for (i, key) in keys.into_iter().enumerate() {
-        if shard_identity.is_key_disposable(&key) {
-            disposable_count += 1;
-            tracing::debug!(key = %key, key.dbg=?key, "disposable key");
-        } else {
-            not_disposable_count += 1;
-        }
-        #[allow(clippy::collapsible_if)]
-        if i % 10000 == 0 {
-            if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() {
-                return Err(ApiError::ShuttingDown);
-            }
-        }
-    }
-
-    json_response(
-        StatusCode::OK,
-        pageserver_api::models::ScanDisposableKeysResponse {
-            disposable_count,
-            not_disposable_count,
-        },
-    )
-}
-
 async fn layer_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -2431,7 +2254,7 @@ async fn tenant_scan_remote_handler(
                         %timeline_id))
            .await
            {
-                Ok((index_part, index_generation, _index_mtime)) => {
+                Ok((index_part, index_generation)) => {
                    tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
                        index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn());
                    generation = std::cmp::max(generation, index_generation);
@@ -2576,6 +2399,31 @@ async fn post_tracing_event_handler(
    json_response(StatusCode::OK, ())
 }

+async fn force_aux_policy_switch_handler( // PUT handler: switches a timeline's aux file policy
+    mut r: Request<Body>,
+    _cancel: CancellationToken, // accepted but not used in this handler
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?; // no tenant id is passed to the permission check
+    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
+    let policy: AuxFilePolicy = json_request(&mut r).await?; // request body is the JSON policy
+    // Resolve the attached tenant shard and the timeline before applying the switch.
+    let state = get_state(&r);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; // bounded wait for activation
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+    timeline
+        .do_switch_aux_policy(policy)
+        .map_err(ApiError::InternalServerError)?; // a failed switch is reported as an internal error
+    // On success, answer 200 OK with `()` as the JSON payload.
+    json_response(StatusCode::OK, ())
+}
+
 async fn put_io_engine_handler(
    mut r: Request<Body>,
    _cancel: CancellationToken,
@@ -3173,9 +3021,6 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
-        .get("/v1/tenant/:tenant_shard_id/timeline_and_offloaded", |r| {
-            api_handler(r, timeline_and_offloaded_list_handler)
-        })
        .post("/v1/tenant/:tenant_shard_id/timeline", |r| {
            api_handler(r, timeline_create_handler)
        })
@@ -3248,10 +3093,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, evict_timeline_layer_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys",
-            |r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys),
-        )
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
            |r| api_handler(r, timeline_gc_blocking_handler),
@@ -3295,6 +3136,10 @@ pub fn make_router(
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
        .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
+            |r| api_handler(r, force_aux_policy_switch_handler),
+        )
        .get("/v1/utilization", |r| api_handler(r, get_utilization))
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1189,7 +1189,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    op: SmgrQueryType,
 }

-impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
+impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    fn drop(&mut self) {
        let elapsed = self.start.elapsed();
        let ex_throttled = self
@@ -1560,7 +1560,7 @@ impl BasebackupQueryTime {
    }
 }

-impl BasebackupQueryTimeOngoingRecording<'_, '_> {
+impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
    pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
        let elapsed = self.start.elapsed();
        let ex_throttled = self
@@ -2092,7 +2092,6 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_received: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
-    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2116,11 +2115,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
        "Number of WAL records filtered out due to sharding"
    )
    .expect("failed to define a metric"),
-    gap_blocks_zeroed_on_rel_extend: register_int_counter!(
-        "pageserver_gap_blocks_zeroed_on_rel_extend",
-        "Total number of zero gap blocks written on relation extends"
-    )
-    .expect("failed to define a metric"),
 });

 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -3040,111 +3034,13 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub mod tokio_epoll_uring {
-    use std::{
-        collections::HashMap,
-        sync::{Arc, Mutex},
-    };
-
-    use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge};
+    use metrics::{register_int_counter, UIntGauge};
    use once_cell::sync::Lazy;

-    /// Shared storage for tokio-epoll-uring thread local metrics.
-    pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy<ThreadLocalMetricsStorage> =
-        Lazy::new(|| {
-            let slots_submission_queue_depth = register_histogram!(
-                "pageserver_tokio_epoll_uring_slots_submission_queue_depth",
-                "The slots waiters queue depth of each tokio_epoll_uring system",
-                vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
-            )
-            .expect("failed to define a metric");
-            ThreadLocalMetricsStorage {
-                observers: Mutex::new(HashMap::new()),
-                slots_submission_queue_depth,
-            }
-        });
-
-    pub struct ThreadLocalMetricsStorage {
-        /// List of thread local metrics observers.
-        observers: Mutex<HashMap<u64, Arc<ThreadLocalMetrics>>>,
-        /// A histogram shared between all thread local systems
-        /// for collecting slots submission queue depth.
-        slots_submission_queue_depth: Histogram,
-    }
-
-    /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its
-    /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic.
-    ///
-    /// The System makes observations into [`Self`] and periodically, the collector
-    /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`].
-    ///
-    /// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`].
-    /// But except for the periodic flush, the lock is uncontended so there's no waiting
-    /// for cache coherence protocol to get an exclusive cache line.
-    pub struct ThreadLocalMetrics {
-        /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth.
-        slots_submission_queue_depth: Mutex<LocalHistogram>,
-    }
-
-    impl ThreadLocalMetricsStorage {
-        /// Registers a new thread local system. Returns a thread local metrics observer.
-        pub fn register_system(&self, id: u64) -> Arc<ThreadLocalMetrics> {
-            let per_system_metrics = Arc::new(ThreadLocalMetrics::new(
-                self.slots_submission_queue_depth.local(),
-            ));
-            let mut g = self.observers.lock().unwrap();
-            g.insert(id, Arc::clone(&per_system_metrics));
-            per_system_metrics
-        }
-
-        /// Removes metrics observer for a thread local system.
-        /// This should be called before dropping a thread local system.
-        pub fn remove_system(&self, id: u64) {
-            let mut g = self.observers.lock().unwrap();
-            g.remove(&id);
-        }
-
-        /// Flush all thread local metrics to the shared storage.
-        pub fn flush_thread_local_metrics(&self) {
-            let g = self.observers.lock().unwrap();
-            g.values().for_each(|local| {
-                local.flush();
-            });
-        }
-    }
-
-    impl ThreadLocalMetrics {
-        pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self {
-            ThreadLocalMetrics {
-                slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth),
-            }
-        }
-
-        /// Flushes the thread local metrics to shared aggregator.
-        pub fn flush(&self) {
-            let Self {
-                slots_submission_queue_depth,
-            } = self;
-            slots_submission_queue_depth.lock().unwrap().flush();
-        }
-    }
-
-    impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics {
-        fn observe_slots_submission_queue_depth(&self, queue_depth: u64) {
-            let Self {
-                slots_submission_queue_depth,
-            } = self;
-            slots_submission_queue_depth
-                .lock()
-                .unwrap()
-                .observe(queue_depth as f64);
-        }
-    }
-
    pub struct Collector {
        descs: Vec<metrics::core::Desc>,
        systems_created: UIntGauge,
        systems_destroyed: UIntGauge,
-        thread_local_metrics_storage: &'static ThreadLocalMetricsStorage,
    }

    impl metrics::core::Collector for Collector {
@@ -3154,7 +3050,7 @@ pub mod tokio_epoll_uring {

        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
            let mut mfs = Vec::with_capacity(Self::NMETRICS);
-            let tokio_epoll_uring::metrics::GlobalMetrics {
+            let tokio_epoll_uring::metrics::Metrics {
                systems_created,
                systems_destroyed,
            } = tokio_epoll_uring::metrics::global();
@@ -3162,21 +3058,12 @@ pub mod tokio_epoll_uring {
            mfs.extend(self.systems_created.collect());
            self.systems_destroyed.set(systems_destroyed);
            mfs.extend(self.systems_destroyed.collect());
-
-            self.thread_local_metrics_storage
-                .flush_thread_local_metrics();
-
-            mfs.extend(
-                self.thread_local_metrics_storage
-                    .slots_submission_queue_depth
-                    .collect(),
-            );
            mfs
        }
    }

    impl Collector {
-        const NMETRICS: usize = 3;
+        const NMETRICS: usize = 2;

        #[allow(clippy::new_without_default)]
        pub fn new() -> Self {
@@ -3208,7 +3095,6 @@ pub mod tokio_epoll_uring {
                descs,
                systems_created,
                systems_destroyed,
-                thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE,
            }
        }
    }
@@ -3568,7 +3454,6 @@ pub fn preinitialize_metrics() {
    Lazy::force(&RECONSTRUCT_TIME);
    Lazy::force(&BASEBACKUP_QUERY_TIME);
    Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
-    Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);

    tenant_throttling::preinitialize_global_metrics();
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -82,7 +82,6 @@ use once_cell::sync::OnceCell;
 use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
-    virtual_file::{IoBufferMut, IoPageSlice},
 };

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -145,7 +144,7 @@ struct SlotInner {
    key: Option<CacheKey>,
    // for `coalesce_readers_permit`
    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
-    buf: IoPageSlice<'static>,
+    buf: &'static mut [u8; PAGE_SZ],
 }

 impl Slot {
@@ -235,13 +234,13 @@ impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.slot_guard.buf.deref()
+        self.slot_guard.buf
    }
 }

 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
    fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.slot_guard.buf.as_ref()
+        self.slot_guard.buf
    }
 }

@@ -267,7 +266,7 @@ enum PageWriteGuardState<'i> {
 impl std::ops::DerefMut for PageWriteGuard<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        match &mut self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(),
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
            PageWriteGuardState::Downgraded => unreachable!(),
        }
    }
@@ -278,7 +277,7 @@ impl std::ops::Deref for PageWriteGuard<'_> {

    fn deref(&self) -> &Self::Target {
        match &self.state {
-            PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(),
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
            PageWriteGuardState::Downgraded => unreachable!(),
        }
    }
@@ -644,7 +643,7 @@ impl PageCache {
        // We could use Vec::leak here, but that potentially also leaks
        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
        // this is avoided.
-        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
+        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
@@ -653,8 +652,7 @@ impl PageCache {
        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
            .map(|chunk| {
-                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
-                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
+                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
                    inner: tokio::sync::RwLock::new(SlotInner {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1326,22 +1326,22 @@ where
                .for_command(ComputeCommandKind::Basebackup)
                .inc();

-            let (lsn, gzip) = match (params.get(2), params.get(3)) {
-                (None, _) => (None, false),
-                (Some(&"--gzip"), _) => (None, true),
-                (Some(lsn_str), gzip_str_opt) => {
-                    let lsn = Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?;
-                    let gzip = match gzip_str_opt {
-                        Some(&"--gzip") => true,
-                        None => false,
-                        Some(third_param) => {
-                            return Err(QueryError::Other(anyhow::anyhow!(
-                                "Parameter in position 3 unknown {third_param}",
-                            )))
-                        }
-                    };
-                    (Some(lsn), gzip)
+            let lsn = if let Some(lsn_str) = params.get(2) {
+                Some(
+                    Lsn::from_str(lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
+                )
+            } else {
+                None
+            };
+
+            let gzip = match params.get(3) {
+                Some(&"--gzip") => true,
+                None => false,
+                Some(third_param) => {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "Parameter in position 3 unknown {third_param}",
+                    )))
                }
            };

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -22,6 +22,7 @@ use pageserver_api::key::{
    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -32,7 +33,7 @@ use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -676,6 +677,21 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

+    async fn list_aux_files_v1(
+        &self,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
+        match self.get(AUX_FILES_KEY, lsn, ctx).await {
+            Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
+            Err(e) => {
+                // Expected: historical databases lack the key. NB: any error kind lands here.
+                debug!("Failed to get info about AUX files: {}", e);
+                Ok(HashMap::new())
+            }
+        }
+    }
+
    async fn list_aux_files_v2(
        &self,
        lsn: Lsn,
@@ -706,7 +722,10 @@ impl Timeline {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<(), PageReconstructError> {
-        self.list_aux_files_v2(lsn, ctx).await?;
+        let current_policy = self.last_aux_file_policy.load();
+        if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
+            self.list_aux_files_v2(lsn, ctx).await?;
+        }
        Ok(())
    }

@@ -715,7 +734,51 @@ impl Timeline {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        self.list_aux_files_v2(lsn, ctx).await
+        let current_policy = self.last_aux_file_policy.load();
+        match current_policy {
+            Some(AuxFilePolicy::V1) => {
+                let res = self.list_aux_files_v1(lsn, ctx).await?;
+                let empty_str = if res.is_empty() { ", empty" } else { "" };
+                warn!(
+                    "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
+                );
+                Ok(res)
+            }
+            None => {
+                let res = self.list_aux_files_v1(lsn, ctx).await?;
+                if !res.is_empty() {
+                    warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
+                }
+                Ok(res)
+            }
+            Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
+            Some(AuxFilePolicy::CrossValidation) => {
+                let v1_result = self.list_aux_files_v1(lsn, ctx).await;
+                let v2_result = self.list_aux_files_v2(lsn, ctx).await;
+                match (v1_result, v2_result) {
+                    (Ok(v1), Ok(v2)) => {
+                        if v1 != v2 {
+                            tracing::error!(
+                                "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
+                            );
+                            return Err(PageReconstructError::Other(anyhow::anyhow!(
+                                "unmatched aux file v1 v2 result"
+                            )));
+                        }
+                        Ok(v1)
+                    }
+                    (Ok(_), Err(v2)) => {
+                        tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
+                        Err(v2)
+                    }
+                    (Err(v1), Ok(_)) => {
+                        tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
+                        Err(v1)
+                    }
+                    (Err(_), Err(v2)) => Err(v2),
+                }
+            }
+        }
    }

    pub(crate) async fn get_replorigins(
@@ -891,6 +954,9 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
+        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
+            result.add_key(AUX_FILES_KEY);
+        }

        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
@@ -1100,6 +1166,9 @@ impl<'a> DatadirModification<'a> {
        self.pending_directory_entries.push((DirectoryKind::Db, 0));
        self.put(DBDIR_KEY, Value::Image(buf.into()));

+        // Create AuxFilesDirectory
+        self.init_aux_dir()?;
+
        let buf = if self.tline.pg_version >= 17 {
            TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
                xids: HashSet::new(),
@@ -1278,6 +1347,9 @@ impl<'a> DatadirModification<'a> {
            // 'true', now write the updated 'dbdirs' map back.
            let buf = DbDirectory::ser(&dbdir)?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));
+
+            // Create AuxFilesDirectory as well
+            self.init_aux_dir()?;
        }
        if r.is_none() {
            // Create RelDirectory
@@ -1506,42 +1578,35 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    /// Drop some relations
-    pub(crate) async fn put_rel_drops(
-        &mut self,
-        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        for ((spc_node, db_node), rel_tags) in drop_relations {
-            let dir_key = rel_dir_to_key(spc_node, db_node);
-            let buf = self.get(dir_key, ctx).await?;
-            let mut dir = RelDirectory::des(&buf)?;
+    /// Drop a relation.
+    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);

-            let mut dirty = false;
-            for rel_tag in rel_tags {
-                if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
-                    dirty = true;
+        // Remove it from the directory entry
+        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
+        let buf = self.get(dir_key, ctx).await?;
+        let mut dir = RelDirectory::des(&buf)?;

-                    // update logical size
-                    let size_key = rel_size_to_key(rel_tag);
-                    let old_size = self.get(size_key, ctx).await?.get_u32_le();
-                    self.pending_nblocks -= old_size as i64;
+        self.pending_directory_entries
+            .push((DirectoryKind::Rel, dir.rels.len()));

-                    // Remove entry from relation size cache
-                    self.tline.remove_cached_rel_size(&rel_tag);
-
-                    // Delete size entry, as well as all blocks
-                    self.delete(rel_key_range(rel_tag));
-                }
-            }
-
-            if dirty {
-                self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
-                self.pending_directory_entries
-                    .push((DirectoryKind::Rel, dir.rels.len()));
-            }
+        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
+            self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
+        } else {
+            warn!("dropped rel {} did not exist in rel directory", rel);
        }

+        // update logical size
+        let size_key = rel_size_to_key(rel);
+        let old_size = self.get(size_key, ctx).await?.get_u32_le();
+        self.pending_nblocks -= old_size as i64;
+
+        // Remove entry from relation size cache
+        self.tline.remove_cached_rel_size(&rel);
+
+        // Delete size entry, as well as all blocks
+        self.delete(rel_key_range(rel));
+
        Ok(())
    }

@@ -1661,60 +1726,200 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    /// Writes an empty V1 aux-file directory image, unless the switch policy is V2.
+    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
+        if matches!(self.tline.get_switch_aux_file_policy(), AuxFilePolicy::V2) {
+            return Ok(());
+        }
+        let empty_dir = AuxFilesDirectory::default();
+        let image = Bytes::from(AuxFilesDirectory::ser(&empty_dir)?);
+        self.pending_directory_entries
+            .push((DirectoryKind::AuxFiles, 0));
+        self.put(AUX_FILES_KEY, Value::Image(image));
+        Ok(())
+    }
+
    pub async fn put_file(
        &mut self,
        path: &str,
        content: &[u8],
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let key = aux_file::encode_aux_file_key(path);
-        // retrieve the key from the engine
-        let old_val = match self.get(key, ctx).await {
-            Ok(val) => Some(val),
-            Err(PageReconstructError::MissingKey(_)) => None,
-            Err(e) => return Err(e.into()),
-        };
-        let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-            aux_file::decode_file_value(old_val)?
-        } else {
-            Vec::new()
-        };
-        let mut other_files = Vec::with_capacity(files.len());
-        let mut modifying_file = None;
-        for file @ (p, content) in files {
-            if path == p {
-                assert!(
-                    modifying_file.is_none(),
-                    "duplicated entries found for {}",
-                    path
-                );
-                modifying_file = Some(content);
+        let switch_policy = self.tline.get_switch_aux_file_policy();
+
+        let policy = {
+            let current_policy = self.tline.last_aux_file_policy.load();
+            // Allowed switch path:
+            // * no aux files -> v1/v2/cross-validation
+            // * cross-validation->v2
+
+            let current_policy = if current_policy.is_none() {
+                // This path will only be hit once per tenant: we will decide the final policy in this code block.
+                // The next call to `put_file` will always have `last_aux_file_policy != None`.
+                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
+                if aux_files_key_v1.is_empty() {
+                    None
+                } else {
+                    warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
+                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
+                    Some(AuxFilePolicy::V1)
+                }
            } else {
-                other_files.push(file);
+                current_policy
+            };
+
+            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
+                self.tline.do_switch_aux_policy(switch_policy)?;
+                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
+                switch_policy
+            } else {
+                // This branch handles an invalid migration path, as well as the case where switch_policy == current_policy.
+                // In practice, because the migration path always allows unspecified -> *, this unwrap_or will never be hit.
+                current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
            }
+        };
+
+        if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
+            let key = aux_file::encode_aux_file_key(path);
+            // retrieve the key from the engine
+            let old_val = match self.get(key, ctx).await {
+                Ok(val) => Some(val),
+                Err(PageReconstructError::MissingKey(_)) => None,
+                Err(e) => return Err(e.into()),
+            };
+            let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
+                aux_file::decode_file_value(old_val)?
+            } else {
+                Vec::new()
+            };
+            let mut other_files = Vec::with_capacity(files.len());
+            let mut modifying_file = None;
+            for file @ (p, content) in files {
+                if path == p {
+                    assert!(
+                        modifying_file.is_none(),
+                        "duplicated entries found for {}",
+                        path
+                    );
+                    modifying_file = Some(content);
+                } else {
+                    other_files.push(file);
+                }
+            }
+            let mut new_files = other_files;
+            match (modifying_file, content.is_empty()) {
+                (Some(old_content), false) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_update(old_content.len(), content.len());
+                    new_files.push((path, content));
+                }
+                (Some(old_content), true) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_remove(old_content.len());
+                    // not adding the file key to the final `new_files` vec.
+                }
+                (None, false) => {
+                    self.tline.aux_file_size_estimator.on_add(content.len());
+                    new_files.push((path, content));
+                }
+                (None, true) => warn!("removing non-existing aux file: {}", path),
+            }
+            let new_val = aux_file::encode_file_value(&new_files)?;
+            self.put(key, Value::Image(new_val.into()));
        }
-        let mut new_files = other_files;
-        match (modifying_file, content.is_empty()) {
-            (Some(old_content), false) => {
-                self.tline
-                    .aux_file_size_estimator
-                    .on_update(old_content.len(), content.len());
-                new_files.push((path, content));
+
+        if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
+            let file_path = path.to_string();
+            let content = if content.is_empty() {
+                None
+            } else {
+                Some(Bytes::copy_from_slice(content))
+            };
+
+            let n_files;
+            let mut aux_files = self.tline.aux_files.lock().await;
+            if let Some(mut dir) = aux_files.dir.take() {
+                // We already updated aux files in `self`: emit a delta and update our latest value.
+                dir.upsert(file_path.clone(), content.clone());
+                n_files = dir.files.len();
+                if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::Image(Bytes::from(
+                            AuxFilesDirectory::ser(&dir).context("serialize")?,
+                        )),
+                    );
+                    aux_files.n_deltas = 0;
+                } else {
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
+                    );
+                    aux_files.n_deltas += 1;
+                }
+                aux_files.dir = Some(dir);
+            } else {
+                // Check if the AUX_FILES_KEY is initialized
+                match self.get(AUX_FILES_KEY, ctx).await {
+                    Ok(dir_bytes) => {
+                        let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
+                        // Key is already set, we may append a delta
+                        self.put(
+                            AUX_FILES_KEY,
+                            Value::WalRecord(NeonWalRecord::AuxFile {
+                                file_path: file_path.clone(),
+                                content: content.clone(),
+                            }),
+                        );
+                        dir.upsert(file_path, content);
+                        n_files = dir.files.len();
+                        aux_files.dir = Some(dir);
+                    }
+                    Err(
+                        e @ (PageReconstructError::Cancelled
+                        | PageReconstructError::AncestorLsnTimeout(_)),
+                    ) => {
+                        // Important that we do not interpret a shutdown error as "not found" and thereby
+                        // reset the map.
+                        return Err(e.into());
+                    }
+                    // Note: the `MissingKey` error variant was added in https://github.com/neondatabase/neon/pull/7393, but
+                    // the original code assumed all other errors were missing keys. Therefore, we keep the code path
+                    // the same for now, though in theory we should only match the `MissingKey` variant.
+                    Err(
+                        e @ (PageReconstructError::Other(_)
+                        | PageReconstructError::WalRedo(_)
+                        | PageReconstructError::MissingKey(_)),
+                    ) => {
+                        // Key is missing, we must insert an image as the basis for subsequent deltas.
+
+                        if !matches!(e, PageReconstructError::MissingKey(_)) {
+                            let e = utils::error::report_compact_sources(&e);
+                            tracing::warn!("treating error as if it was a missing key: {}", e);
+                        }
+
+                        let mut dir = AuxFilesDirectory {
+                            files: HashMap::new(),
+                        };
+                        dir.upsert(file_path, content);
+                        self.put(
+                            AUX_FILES_KEY,
+                            Value::Image(Bytes::from(
+                                AuxFilesDirectory::ser(&dir).context("serialize")?,
+                            )),
+                        );
+                        n_files = 1;
+                        aux_files.dir = Some(dir);
+                    }
+                }
            }
-            (Some(old_content), true) => {
-                self.tline
-                    .aux_file_size_estimator
-                    .on_remove(old_content.len());
-                // not adding the file key to the final `new_files` vec.
-            }
-            (None, false) => {
-                self.tline.aux_file_size_estimator.on_add(content.len());
-                new_files.push((path, content));
-            }
-            (None, true) => warn!("removing non-existing aux file: {}", path),
+
+            self.pending_directory_entries
+                .push((DirectoryKind::AuxFiles, n_files));
        }
-        let new_val = aux_file::encode_file_value(&new_files)?;
-        self.put(key, Value::Image(new_val.into()));

        Ok(())
    }
@@ -1884,6 +2089,12 @@ impl<'a> DatadirModification<'a> {
        self.tline.get(key, lsn, ctx).await
    }

+    /// Only used in unit tests: force-puts a key into the modification.
+    #[cfg(test)]
+    pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
+        self.put(key, val);
+    }
+
    fn put(&mut self, key: Key, val: Value) {
        if Self::is_data_key(&key) {
            self.put_data(key.to_compact(), val)
@@ -2001,6 +2212,21 @@ struct RelDirectory {
    rels: HashSet<(Oid, u8)>,
 }

+#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
+pub(crate) struct AuxFilesDirectory {
+    pub(crate) files: HashMap<String, Bytes>,
+}
+
+impl AuxFilesDirectory {
+    /// Inserts `key` with `value`, or removes `key` when `value` is `None`.
+    pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
+        match value {
+            Some(bytes) => self.files.insert(key, bytes),
+            None => self.files.remove(&key),
+        };
+    }
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 struct RelSizeEntry {
    nblocks: u32,
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -53,22 +53,6 @@ impl Statvfs {
            Statvfs::Mock(stat) => stat.block_size,
        }
    }
-
-    /// Get the available and total bytes on the filesystem.
-    pub fn get_avail_total_bytes(&self) -> (u64, u64) {
-        // https://unix.stackexchange.com/a/703650
-        let blocksize = if self.fragment_size() > 0 {
-            self.fragment_size()
-        } else {
-            self.block_size()
-        };
-
-        // use blocks_available (b_avail) since, pageserver runs as unprivileged user
-        let avail_bytes = self.blocks_available() * blocksize;
-        let total_bytes = self.blocks() * blocksize;
-
-        (avail_bytes, total_bytes)
-    }
 }

 pub mod mock {
@@ -90,7 +74,7 @@ pub mod mock {
                let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();

                // round it up to the nearest block multiple
-                let used_blocks = used_bytes.div_ceil(*blocksize);
+                let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;

                if used_blocks > *total_blocks {
                    panic!(
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,8 +5,6 @@
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
 use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
-#[cfg(test)]
-use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
 use std::ops::Deref;
@@ -42,7 +40,7 @@ pub enum BlockLease<'a> {
    #[cfg(test)]
    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
    #[cfg(test)]
-    IoBufferMut(IoBufferMut),
+    Vec(Vec<u8>),
 }

 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -52,13 +50,13 @@ impl From<PageReadGuard<'static>> for BlockLease<'static> {
 }

 #[cfg(test)]
-impl From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'_> {
+impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
    fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
        BlockLease::Arc(value)
    }
 }

-impl Deref for BlockLease<'_> {
+impl<'a> Deref for BlockLease<'a> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
@@ -69,7 +67,7 @@ impl Deref for BlockLease<'_> {
            #[cfg(test)]
            BlockLease::Arc(v) => v.deref(),
            #[cfg(test)]
-            BlockLease::IoBufferMut(v) => {
+            BlockLease::Vec(v) => {
                TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
            }
        }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -131,7 +131,7 @@ struct OnDiskNode<'a, const L: usize> {
    values: &'a [u8],
 }

-impl<const L: usize> OnDiskNode<'_, L> {
+impl<'a, const L: usize> OnDiskNode<'a, L> {
    ///
    /// Interpret a PAGE_SZ page as a node.
    ///
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -6,11 +6,10 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache;
 use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
-use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
 use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
 use crate::virtual_file::owned_buffers_io::write::Buffer;
-use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
+use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
 use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use num_traits::Num;
@@ -108,18 +107,15 @@ impl EphemeralFile {
        self.page_cache_file_id
    }

-    pub(crate) async fn load_to_io_buf(
-        &self,
-        ctx: &RequestContext,
-    ) -> Result<IoBufferMut, io::Error> {
+    pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
        let size = self.len().into_usize();
-        let buf = IoBufferMut::with_capacity(size);
-        let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?;
+        let vec = Vec::with_capacity(size);
+        let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
        assert_eq!(nread, size);
-        let buf = slice.into_inner();
-        assert_eq!(buf.len(), nread);
-        assert_eq!(buf.capacity(), size, "we shouldn't be reallocating");
-        Ok(buf)
+        let vec = slice.into_inner();
+        assert_eq!(vec.len(), nread);
+        assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
+        Ok(vec)
    }

    /// Returns the offset at which the first byte of the input was written, for use
@@ -162,7 +158,7 @@ impl EphemeralFile {
 }

 impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
-    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
+    async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
        &'b self,
        start: u64,
        dst: tokio_epoll_uring::Slice<B>,
@@ -349,7 +345,7 @@ mod tests {
        assert!(file.len() as usize == write_nbytes);
        for i in 0..write_nbytes {
            assert_eq!(value_offsets[i], i.into_u64());
-            let buf = IoBufferMut::with_capacity(1);
+            let buf = Vec::with_capacity(1);
            let (buf_slice, nread) = file
                .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
                .await
@@ -389,7 +385,7 @@ mod tests {

        // assert the state is as this test expects it to be
        assert_eq!(
-            &file.load_to_io_buf(&ctx).await.unwrap(),
+            &file.load_to_vec(&ctx).await.unwrap(),
            &content[0..cap + cap / 2]
        );
        let md = file
@@ -444,7 +440,7 @@ mod tests {
                let (buf, nread) = file
                    .read_exact_at_eof_ok(
                        start.into_u64(),
-                        IoBufferMut::with_capacity(len).slice_full(),
+                        Vec::with_capacity(len).slice_full(),
                        ctx,
                    )
                    .await
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -11,7 +11,6 @@ use pageserver_api::shard::{
 };
 use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
-use remote_storage::TimeoutOrCancel;
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap, HashSet};
@@ -1351,17 +1350,47 @@ impl TenantManager {
        }
    }

+    async fn delete_tenant_remote(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), DeleteTenantError> {
+        let remote_path = remote_tenant_path(&tenant_shard_id);
+        let mut keys_stream = self.resources.remote_storage.list_streaming(
+            Some(&remote_path),
+            remote_storage::ListingMode::NoDelimiter,
+            None,
+            &self.cancel,
+        );
+        while let Some(chunk) = keys_stream.next().await {
+            let keys = match chunk {
+                Ok(listing) => listing.keys,
+                Err(remote_storage::DownloadError::Cancelled) => {
+                    return Err(DeleteTenantError::Cancelled)
+                }
+                Err(remote_storage::DownloadError::NotFound) => return Ok(()),
+                Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
+            };
+
+            if keys.is_empty() {
+                tracing::info!("Remote storage already deleted");
+            } else {
+                tracing::info!("Deleting {} keys from remote storage", keys.len());
+                let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
+                self.resources
+                    .remote_storage
+                    .delete_objects(&keys, &self.cancel)
+                    .await?;
+            }
+        }
+
+        Ok(())
+    }
+
    /// If a tenant is attached, detach it.  Then remove its data from remote storage.
    ///
    /// A tenant is considered deleted once it is gone from remote storage.  It is the caller's
    /// responsibility to avoid trying to attach the tenant again or use it any way once deletion
    /// has started: this operation is not atomic, and must be retried until it succeeds.
-    ///
-    /// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove
-    /// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage
-    /// controller uses this to purge all remote tenant data, including any stale parent shards that
-    /// may remain after splits. Ideally, this special case would be handled elsewhere. See:
-    /// <https://github.com/neondatabase/neon/pull/9394>.
    pub(crate) async fn delete_tenant(
        &self,
        tenant_shard_id: TenantShardId,
@@ -1413,29 +1442,25 @@ impl TenantManager {
        //   in 500 responses to delete requests.
        // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
        //   503/retry, rather than kicking off a wasteful concurrent deletion.
-        // NB: this also deletes partial prefixes, i.e. a <tenant_id> path will delete all
-        // <tenant_id>_<shard_id>/* objects. See method comment for why.
-        backoff::retry(
-            || async move {
-                self.resources
-                    .remote_storage
-                    .delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel)
-                    .await
+        match backoff::retry(
+            || async move { self.delete_tenant_remote(tenant_shard_id).await },
+            |e| match e {
+                DeleteTenantError::Cancelled => true,
+                DeleteTenantError::SlotError(_) => {
+                    unreachable!("Remote deletion doesn't touch slots")
+                }
+                _ => false,
            },
-            |_| false, // backoff::retry handles cancellation
            1,
            3,
            &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
            &self.cancel,
        )
        .await
-        .unwrap_or(Err(TimeoutOrCancel::Cancel.into()))
-        .map_err(|err| {
-            if TimeoutOrCancel::caused_by_cancel(&err) {
-                return DeleteTenantError::Cancelled;
-            }
-            DeleteTenantError::Other(err)
-        })
+        {
+            Some(r) => r,
+            None => Err(DeleteTenantError::Cancelled),
+        }
    }

    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -180,7 +180,6 @@

 pub(crate) mod download;
 pub mod index;
-pub mod manifest;
 pub(crate) mod upload;

 use anyhow::Context;
@@ -188,10 +187,11 @@ use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

 pub(crate) use download::download_initdb_tar_zst;
-use pageserver_api::models::TimelineArchivalState;
+use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
+pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -245,11 +245,9 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
 use super::Generation;

 pub(crate) use download::{
-    do_download_tenant_manifest, download_index_part, is_temp_download_file,
-    list_remote_tenant_shards, list_remote_timelines,
+    download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
 };
 pub(crate) use index::LayerFileMetadata;
-pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};

 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -274,12 +272,6 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 /// which we warn and skip.
 const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);

-/// Hardcode a generation for the tenant manifest for now so that we don't
-/// need to deal with generation-less manifests in the future.
-///
-/// TODO: add proper generation support to all the places that use this.
-pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1);
-
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -303,10 +295,6 @@ pub enum WaitCompletionError {
    UploadQueueShutDownOrStopped,
 }

-#[derive(Debug, thiserror::Error)]
-#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
-pub struct UploadQueueNotReadyError;
-
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -480,20 +468,6 @@ impl RemoteTimelineClient {
            .ok()
    }

-    /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived.
-    ///
-    /// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet.
-    pub(crate) fn archived_at_stopped_queue(
-        &self,
-    ) -> Result<Option<NaiveDateTime>, UploadQueueNotReadyError> {
-        self.upload_queue
-            .lock()
-            .unwrap()
-            .stopped_mut()
-            .map(|q| q.upload_queue_for_deletion.clean.0.archived_at)
-            .map_err(|_| UploadQueueNotReadyError)
-    }
-
    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
            current_remote_index_part
@@ -531,7 +505,7 @@ impl RemoteTimelineClient {
            },
        );

-        let (index_part, index_generation, index_last_modified) = download::download_index_part(
+        let (index_part, _index_generation) = download::download_index_part(
            &self.storage_impl,
            &self.tenant_shard_id,
            &self.timeline_id,
@@ -545,49 +519,6 @@ impl RemoteTimelineClient {
        )
        .await?;

-        // Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very
-        // old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g.
-        // in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that
-        // when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is
-        // also a newer index available, that is surprising.
-        const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600);
-        let index_age = index_last_modified.elapsed().unwrap_or_else(|e| {
-            if e.duration() > Duration::from_secs(5) {
-                // We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution
-                // timestamp, it is common to be out by at least 1 second.
-                tracing::warn!("Index has modification time in the future: {e}");
-            }
-            Duration::ZERO
-        });
-        if index_age > INDEX_AGE_CHECKS_THRESHOLD {
-            tracing::info!(
-                ?index_generation,
-                age = index_age.as_secs_f64(),
-                "Loaded an old index, checking for other indices..."
-            );
-
-            // Find the highest-generation index
-            let (_latest_index_part, latest_index_generation, latest_index_mtime) =
-                download::download_index_part(
-                    &self.storage_impl,
-                    &self.tenant_shard_id,
-                    &self.timeline_id,
-                    Generation::MAX,
-                    cancel,
-                )
-                .await?;
-
-            if latest_index_generation > index_generation {
-                // Unexpected!  Why are we loading such an old index if a more recent one exists?
-                tracing::warn!(
-                    ?index_generation,
-                    ?latest_index_generation,
-                    ?latest_index_mtime,
-                    "Found a newer index while loading an old one"
-                );
-            }
-        }
-
        if index_part.deleted_at.is_some() {
            Ok(MaybeDeletedIndexPart::Deleted(index_part))
        } else {
@@ -697,6 +628,18 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Launch an index-file upload operation in the background, with only the `last_aux_file_policy` field updated.
+    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
+        self: &Arc<Self>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue)?;
+        Ok(())
+    }
+
    /// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
    ///
    /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
@@ -1278,14 +1221,10 @@ impl RemoteTimelineClient {
        let fut = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = match &mut *guard {
-                UploadQueue::Stopped(_) => {
-                    scopeguard::ScopeGuard::into_inner(sg);
-                    return;
-                }
+                UploadQueue::Stopped(_) => return,
                UploadQueue::Uninitialized => {
                    // transition into Stopped state
                    self.stop_impl(&mut guard);
-                    scopeguard::ScopeGuard::into_inner(sg);
                    return;
                }
                UploadQueue::Initialized(ref mut init) => init,
@@ -2212,7 +2151,7 @@ pub(crate) struct UploadQueueAccessor<'a> {
    inner: std::sync::MutexGuard<'a, UploadQueue>,
 }

-impl UploadQueueAccessor<'_> {
+impl<'a> UploadQueueAccessor<'a> {
    pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
        match &*self.inner {
            UploadQueue::Initialized(x) => &x.clean.0,
@@ -2228,17 +2167,6 @@ pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    RemotePath::from_string(&path).expect("Failed to construct path")
 }

-pub fn remote_tenant_manifest_path(
-    tenant_shard_id: &TenantShardId,
-    generation: Generation,
-) -> RemotePath {
-    let path = format!(
-        "tenants/{tenant_shard_id}/tenant-manifest{}.json",
-        generation.get_suffix()
-    );
-    RemotePath::from_string(&path).expect("Failed to construct path")
-}
-
 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
    RemotePath::from_string(&path).expect("Failed to construct path")
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,7 +6,6 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::str::FromStr;
-use std::time::SystemTime;

 use anyhow::{anyhow, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -34,11 +33,10 @@ use utils::id::{TenantId, TimelineId};
 use utils::pausable_failpoint;

 use super::index::{IndexPart, LayerFileMetadata};
-use super::manifest::TenantManifest;
 use super::{
    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path,
-    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
+    remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
+    FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };

 ///
@@ -339,15 +337,19 @@ pub async fn list_remote_timelines(
    list_identifiers::<TimelineId>(storage, remote_path, cancel).await
 }

-async fn do_download_remote_path_retry_forever(
+async fn do_download_index_part(
    storage: &GenericRemoteStorage,
-    remote_path: &RemotePath,
+    tenant_shard_id: &TenantShardId,
+    timeline_id: &TimelineId,
+    index_generation: Generation,
    cancel: &CancellationToken,
-) -> Result<(Vec<u8>, SystemTime), DownloadError> {
-    download_retry_forever(
+) -> Result<(IndexPart, Generation), DownloadError> {
+    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
+
+    let index_part_bytes = download_retry_forever(
        || async {
            let download = storage
-                .download(remote_path, &DownloadOpts::default(), cancel)
+                .download(&remote_path, &DownloadOpts::default(), cancel)
                .await?;

            let mut bytes = Vec::new();
@@ -357,50 +359,18 @@ async fn do_download_remote_path_retry_forever(

            tokio::io::copy_buf(&mut stream, &mut bytes).await?;

-            Ok((bytes, download.last_modified))
+            Ok(bytes)
        },
        &format!("download {remote_path:?}"),
        cancel,
    )
-    .await
-}
-
-pub async fn do_download_tenant_manifest(
-    storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    cancel: &CancellationToken,
-) -> Result<(TenantManifest, Generation), DownloadError> {
-    // TODO: generation support
-    let generation = super::TENANT_MANIFEST_GENERATION;
-    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
-
-    let (manifest_bytes, _manifest_bytes_mtime) =
-        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
-
-    let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
-        .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
-        .map_err(DownloadError::Other)?;
-
-    Ok((tenant_manifest, generation))
-}
-
-async fn do_download_index_part(
-    storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-    index_generation: Generation,
-    cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
-    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
-
-    let (index_part_bytes, index_part_mtime) =
-        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
+    .await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
        .map_err(DownloadError::Other)?;

-    Ok((index_part, index_generation, index_part_mtime))
+    Ok((index_part, index_generation))
 }

 /// index_part.json objects are suffixed with a generation number, so we cannot
@@ -415,7 +385,7 @@ pub(crate) async fn download_index_part(
    timeline_id: &TimelineId,
    my_generation: Generation,
    cancel: &CancellationToken,
-) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
+) -> Result<(IndexPart, Generation), DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    if my_generation.is_none() {
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -121,11 +121,11 @@ impl IndexPart {
        self.disk_consistent_lsn
    }

-    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+    pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
        serde_json::from_slice::<IndexPart>(bytes)
    }

-    pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
+    pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
        serde_json::to_vec(self)
    }

@@ -133,6 +133,10 @@ impl IndexPart {
    pub(crate) fn example() -> Self {
        Self::empty(TimelineMetadata::example())
    }
+
+    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
+        self.last_aux_file_policy
+    }
 }

 /// Metadata gathered for each of the layer files.
@@ -383,7 +387,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -427,7 +431,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -472,7 +476,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -520,7 +524,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
+        let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();

        assert_eq!(empty_layers_parsed, expected);
    }
@@ -563,7 +567,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -609,7 +613,7 @@ mod tests {
            last_aux_file_policy: None,
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -660,7 +664,7 @@ mod tests {
            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -716,7 +720,7 @@ mod tests {
            last_aux_file_policy: Default::default(),
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -773,7 +777,7 @@ mod tests {
            last_aux_file_policy: Default::default(),
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

@@ -835,7 +839,7 @@ mod tests {
            archived_at: None,
        };

-        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
        assert_eq!(part, expected);
    }

--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -1,53 +0,0 @@
-use chrono::NaiveDateTime;
-use serde::{Deserialize, Serialize};
-use utils::{id::TimelineId, lsn::Lsn};
-
-/// Tenant-shard scoped manifest
-#[derive(Clone, Serialize, Deserialize)]
-pub struct TenantManifest {
-    /// Debugging aid describing the version of this manifest.
-    /// Can also be used for distinguishing breaking changes later on.
-    pub version: usize,
-
-    /// The list of offloaded timelines together with enough information
-    /// to not have to actually load them.
-    ///
-    /// Note: the timelines mentioned in this list might be deleted, i.e.
-    /// we don't hold an invariant that the references aren't dangling.
-    /// Existence of index-part.json is the actual indicator of timeline existence.
-    pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
-}
-
-/// The remote level representation of an offloaded timeline.
-///
-/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
-/// but the two datastructures serve different needs, this is for a persistent disk format
-/// that must be backwards compatible, while the other is only for informative purposes.
-#[derive(Clone, Serialize, Deserialize, Copy)]
-pub struct OffloadedTimelineManifest {
-    pub timeline_id: TimelineId,
-    /// Whether the timeline has a parent it has been branched off from or not
-    pub ancestor_timeline_id: Option<TimelineId>,
-    /// Whether to retain the branch lsn at the ancestor or not
-    pub ancestor_retain_lsn: Option<Lsn>,
-    /// The time point when the timeline was archived
-    pub archived_at: NaiveDateTime,
-}
-
-pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
-
-impl TenantManifest {
-    pub(crate) fn empty() -> Self {
-        Self {
-            version: LATEST_TENANT_MANIFEST_VERSION,
-            offloaded_timelines: vec![],
-        }
-    }
-    pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
-        serde_json::from_slice::<Self>(bytes)
-    }
-
-    pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
-        serde_json::to_vec(self)
-    }
-}
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -13,11 +13,9 @@ use tokio_util::sync::CancellationToken;
 use utils::{backoff, pausable_failpoint};

 use super::index::IndexPart;
-use super::manifest::TenantManifest;
 use super::Generation;
 use crate::tenant::remote_timeline_client::{
    remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path,
-    remote_tenant_manifest_path,
 };
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
 use utils::id::{TenantId, TimelineId};
@@ -41,7 +39,7 @@ pub(crate) async fn upload_index_part<'a>(
    pausable_failpoint!("before-upload-index-pausable");

    // FIXME: this error comes too late
-    let serialized = index_part.to_json_bytes()?;
+    let serialized = index_part.to_s3_bytes()?;
    let serialized = Bytes::from(serialized);

    let index_part_size = serialized.len();
@@ -57,37 +55,6 @@ pub(crate) async fn upload_index_part<'a>(
        .await
        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
-/// Serializes and uploads the given tenant manifest data to the remote storage.
-pub(crate) async fn upload_tenant_manifest(
-    storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    generation: Generation,
-    tenant_manifest: &TenantManifest,
-    cancel: &CancellationToken,
-) -> anyhow::Result<()> {
-    tracing::trace!("uploading new tenant manifest");
-
-    fail_point!("before-upload-manifest", |_| {
-        bail!("failpoint before-upload-manifest")
-    });
-    pausable_failpoint!("before-upload-manifest-pausable");
-
-    let serialized = tenant_manifest.to_json_bytes()?;
-    let serialized = Bytes::from(serialized);
-
-    let tenant_manifest_site = serialized.len();
-
-    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
-    storage
-        .upload_storage_object(
-            futures::stream::once(futures::future::ready(Ok(serialized))),
-            tenant_manifest_site,
-            &remote_path,
-            cancel,
-        )
-        .await
-        .with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'"))
-}

 /// Attempts to upload given layer files.
 /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -108,6 +108,7 @@ impl scheduler::Completion for WriteComplete {
 /// when we last did a write.  We only populate this after doing at least one
 /// write for a tenant -- this avoids holding state for tenants that have
 /// uploads disabled.
+
 struct UploaderTenantState {
    // This Weak only exists to enable culling idle instances of this type
    // when the Tenant has been deallocated.
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -187,8 +187,6 @@ pub(super) async fn gather_inputs(
    // but it is unlikely to cause any issues. In the worst case,
    // the calculation will error out.
    timelines.retain(|t| t.is_active());
-    // Also filter out archived timelines.
-    timelines.retain(|t| t.is_archived() != Some(true));

    // Build a map of branch points.
    let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,6 +1,5 @@
 //! Common traits and structs for layers

-pub mod batch_split_writer;
 pub mod delta_layer;
 pub mod filter_iterator;
 pub mod image_layer;
@@ -9,6 +8,7 @@ pub(crate) mod layer;
 mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;
+pub mod split_writer;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
@@ -705,7 +705,7 @@ pub mod tests {
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
 struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range<T>);

-impl<T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'_, T> {
+impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}..{}", self.0.start, self.0.end)
    }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -44,11 +44,11 @@ use crate::tenant::vectored_blob_io::{
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
-use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
+use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -515,8 +515,8 @@ impl DeltaLayerWriterInner {
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let temp_path = self.path.clone();
        let result = self.finish0(key_end, ctx).await;
-        if let Err(ref e) = result {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
+        if result.is_err() {
+            tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
            if let Err(e) = std::fs::remove_file(&temp_path) {
                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
            }
@@ -529,7 +529,8 @@ impl DeltaLayerWriterInner {
        key_end: Key,
        ctx: &RequestContext,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
+        let index_start_blk =
+            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

        let mut file = self.blob_writer.into_inner(ctx).await?;

@@ -1002,7 +1003,7 @@ impl DeltaLayerInner {
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let mut buf = Some(IoBufferMut::with_capacity(buf_size));
+        let mut buf = Some(BytesMut::with_capacity(buf_size));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -1029,7 +1030,7 @@ impl DeltaLayerInner {

                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(IoBufferMut::with_capacity(buf_size));
+                    buf = Some(BytesMut::with_capacity(buf_size));

                    continue;
                }
@@ -1084,7 +1085,7 @@ impl DeltaLayerInner {
        }
    }

-    pub(crate) async fn index_entries<'a>(
+    pub(super) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
@@ -1203,7 +1204,7 @@ impl DeltaLayerInner {
            .map(|x| x.0.get())
            .unwrap_or(8192);

-        let mut buffer = Some(IoBufferMut::with_capacity(max_read_size));
+        let mut buffer = Some(BytesMut::with_capacity(max_read_size));

        // FIXME: buffering of DeltaLayerWriter
        let mut per_blob_copy = Vec::new();
@@ -1346,7 +1347,7 @@ impl DeltaLayerInner {

        tree_reader.dump().await?;

-        let keys = self.index_entries(ctx).await?;
+        let keys = self.load_keys(ctx).await?;

        async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
            let buf = val.load_raw(ctx).await?;
@@ -1453,16 +1454,6 @@ impl DeltaLayerInner {
            ),
        }
    }
-
-    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
-    //
-    // We're reusing the index traversal logical in plan_reads; would be nice to
-    // factor that out.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
-        self.index_entries(ctx)
-            .await
-            .map(|entries| entries.into_iter().map(|entry| entry.key).collect())
-    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -1571,11 +1562,12 @@ impl<'a> DeltaLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let buf = IoBufferMut::with_capacity(buf_size);
+        let buf = BytesMut::with_capacity(buf_size);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let view = BufView::new_slice(&blobs_buf.buf);
+        let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
        for meta in blobs_buf.blobs.iter() {
            let blob_read = meta.read(&view).await?;
            let value = Value::des(&blob_read)?;
@@ -1950,7 +1942,7 @@ pub(crate) mod test {
                &vectored_reads,
                constants::MAX_VECTORED_READ_BYTES,
            );
-            let mut buf = Some(IoBufferMut::with_capacity(buf_size));
+            let mut buf = Some(BytesMut::with_capacity(buf_size));

            for read in vectored_reads {
                let blobs_buf = vectored_blob_reader
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -41,11 +41,10 @@ use crate::tenant::vectored_blob_io::{
 };
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::Bytes;
+use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
@@ -548,10 +547,10 @@ impl ImageLayerInner {
        for read in plan.into_iter() {
            let buf_size = read.size();

-            let buf = IoBufferMut::with_capacity(buf_size);
+            let buf = BytesMut::with_capacity(buf_size);
            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-
-            let view = BufView::new_slice(&blobs_buf.buf);
+            let frozen_buf = blobs_buf.buf.freeze();
+            let view = BufView::new_bytes(frozen_buf);

            for meta in blobs_buf.blobs.iter() {
                let img_buf = meta.read(&view).await?;
@@ -610,12 +609,13 @@ impl ImageLayerInner {
                }
            }

-            let buf = IoBufferMut::with_capacity(buf_size);
+            let buf = BytesMut::with_capacity(buf_size);
            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;

            match res {
                Ok(blobs_buf) => {
-                    let view = BufView::new_slice(&blobs_buf.buf);
+                    let frozen_buf = blobs_buf.buf.freeze();
+                    let view = BufView::new_bytes(frozen_buf);
                    for meta in blobs_buf.blobs.iter() {
                        let img_buf = meta.read(&view).await;

@@ -673,21 +673,6 @@ impl ImageLayerInner {
            ),
        }
    }
-
-    /// NB: not super efficient, but not terrible either. Should prob be an iterator.
-    //
-    // We're reusing the index traversal logical in plan_reads; would be nice to
-    // factor that out.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result<Vec<Key>> {
-        let plan = self
-            .plan_reads(KeySpace::single(self.key_range.clone()), None, ctx)
-            .await?;
-        Ok(plan
-            .into_iter()
-            .flat_map(|read| read.blobs_at)
-            .map(|(_, blob_meta)| blob_meta.key)
-            .collect())
-    }
 }

 /// A builder object for constructing a new image layer.
@@ -843,26 +828,8 @@ impl ImageLayerWriterInner {
        ctx: &RequestContext,
        end_key: Option<Key>,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        let temp_path = self.path.clone();
-        let result = self.finish0(ctx, end_key).await;
-        if let Err(ref e) = result {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
-            }
-        }
-        result
-    }
-
-    ///
-    /// Finish writing the image layer.
-    ///
-    async fn finish0(
-        self,
-        ctx: &RequestContext,
-        end_key: Option<Key>,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;
+        let index_start_blk =
+            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

        // Calculate compression ratio
        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
@@ -1024,7 +991,7 @@ impl ImageLayerWriter {
        self.inner.take().unwrap().finish(ctx, None).await
    }

-    /// Finish writing the image layer with an end key, used in [`super::batch_split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
+    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
    pub(super) async fn finish_with_end_key(
        mut self,
        end_key: Key,
@@ -1084,11 +1051,12 @@ impl<'a> ImageLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let buf = IoBufferMut::with_capacity(buf_size);
+        let buf = BytesMut::with_capacity(buf_size);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let view = BufView::new_slice(&blobs_buf.buf);
+        let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
        for meta in blobs_buf.blobs.iter() {
            let img_buf = meta.read(&view).await?;
            next_batch.push_back((
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -14,6 +14,7 @@ use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
 use anyhow::{anyhow, Context, Result};
+use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
 use pageserver_api::keyspace::KeySpace;
@@ -808,8 +809,9 @@ impl InMemoryLayer {

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
-                let file_contents = inner.file.load_to_io_buf(ctx).await?;
-                let file_contents = file_contents.freeze();
+                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
+
+                let file_contents = Bytes::from(file_contents);

                for (key, vec_map) in inner.index.iter() {
                    // Write all page versions
@@ -823,7 +825,7 @@ impl InMemoryLayer {
                            len,
                            will_init,
                        } = entry;
-                        let buf = file_contents.slice(pos as usize..(pos + len) as usize);
+                        let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
                        let (_buf, res) = delta_layer_writer
                            .put_value_bytes(
                                Key::from_compact(*key),
--- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -9,7 +9,6 @@ use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
 use crate::{
    assert_u64_eq_usize::{U64IsUsize, UsizeIsU64},
    context::RequestContext,
-    virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut},
 };

 /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
@@ -25,7 +24,7 @@ pub trait File: Send {
    /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`.
    ///
    /// No guarantees are made about the remaining bytes in `dst` in case of a short read.
-    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
+    async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
        &'b self,
        start: u64,
        dst: Slice<B>,
@@ -228,7 +227,7 @@ where

    // Execute physical reads and fill the logical read buffers
    // TODO: pipelined reads; prefetch;
-    let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE);
+    let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE);
    for PhysicalRead {
        start_chunk_no,
        nchunks,
@@ -460,7 +459,7 @@ mod tests {
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
        let file = InMemoryFile::new_random(10);
        let test_read = |pos, len| {
-            let buf = IoBufferMut::with_capacity_zeroed(len);
+            let buf = vec![0; len];
            let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx);
            use futures::FutureExt;
            let (slice, nread) = fut
@@ -471,9 +470,9 @@ mod tests {
            buf.truncate(nread);
            buf
        };
-        assert_eq!(&test_read(0, 1), &file.content[0..1]);
-        assert_eq!(&test_read(1, 2), &file.content[1..3]);
-        assert_eq!(&test_read(9, 2), &file.content[9..]);
+        assert_eq!(test_read(0, 1), &file.content[0..1]);
+        assert_eq!(test_read(1, 2), &file.content[1..3]);
+        assert_eq!(test_read(9, 2), &file.content[9..]);
        assert!(test_read(10, 2).is_empty());
        assert!(test_read(11, 2).is_empty());
    }
@@ -610,7 +609,7 @@ mod tests {
    }

    impl<'x> File for RecorderFile<'x> {
-        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>(
+        async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
            &'b self,
            start: u64,
            dst: Slice<B>,
@@ -783,7 +782,7 @@ mod tests {
            2048,  1024 => Err("foo".to_owned()),
        };

-        let buf = IoBufferMut::with_capacity(512);
+        let buf = Vec::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
            .await
@@ -791,7 +790,7 @@ mod tests {
        assert_eq!(nread, 512);
        assert_eq!(&buf.into_inner()[..nread], &[0; 512]);

-        let buf = IoBufferMut::with_capacity(512);
+        let buf = Vec::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
            .await
@@ -799,7 +798,7 @@ mod tests {
        assert_eq!(nread, 512);
        assert_eq!(&buf.into_inner()[..nread], &[1; 512]);

-        let buf = IoBufferMut::with_capacity(512);
+        let buf = Vec::with_capacity(512);
        let (buf, nread) = mock_file
            .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
            .await
@@ -807,7 +806,7 @@ mod tests {
        assert_eq!(nread, 10);
        assert_eq!(&buf.into_inner()[..nread], &[2; 10]);

-        let buf = IoBufferMut::with_capacity(1024);
+        let buf = Vec::with_capacity(1024);
        let err = mock_file
            .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
            .await
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -19,7 +19,7 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::timeline::{CompactionError, GetVectoredError};
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

-use super::delta_layer::{self};
+use super::delta_layer::{self, DeltaEntry};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
@@ -341,10 +341,6 @@ impl Layer {
        Ok(())
    }

-    pub(crate) async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
-        self.0.needs_download().await
-    }
-
    /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction
    /// while the guard exists.
    ///
@@ -978,7 +974,7 @@ impl LayerInner {
        let timeline = self
            .timeline
            .upgrade()
-            .ok_or(DownloadError::TimelineShutdown)?;
+            .ok_or_else(|| DownloadError::TimelineShutdown)?;

        // count cancellations, which currently remain largely unexpected
        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
@@ -1841,22 +1837,23 @@ impl ResidentLayer {
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<pageserver_api::key::Key>> {
+    ) -> anyhow::Result<Vec<DeltaEntry<'a>>> {
        use LayerKind::*;

        let owner = &self.owner.0;
-        let inner = self.downloaded.get(owner, ctx).await?;
+        match self.downloaded.get(owner, ctx).await? {
+            Delta(ref d) => {
+                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+                // while it's being held.
+                self.owner.record_access(ctx);

-        // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-        // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-        // while it's being held.
-        self.owner.record_access(ctx);
-
-        let res = match inner {
-            Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await,
-            Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await,
-        };
-        res.with_context(|| format!("Layer index is corrupted for {self}"))
+                delta_layer::DeltaLayerInner::load_keys(d, ctx)
+                    .await
+                    .with_context(|| format!("Layer index is corrupted for {self}"))
+            }
+            Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")),
+        }
    }

    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -57,34 +57,6 @@ impl std::fmt::Display for PersistentLayerKey {
    }
 }

-impl From<ImageLayerName> for PersistentLayerKey {
-    fn from(image_layer_name: ImageLayerName) -> Self {
-        Self {
-            key_range: image_layer_name.key_range,
-            lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn),
-            is_delta: false,
-        }
-    }
-}
-
-impl From<DeltaLayerName> for PersistentLayerKey {
-    fn from(delta_layer_name: DeltaLayerName) -> Self {
-        Self {
-            key_range: delta_layer_name.key_range,
-            lsn_range: delta_layer_name.lsn_range,
-            is_delta: true,
-        }
-    }
-}
-
-impl From<LayerName> for PersistentLayerKey {
-    fn from(layer_name: LayerName) -> Self {
-        match layer_name {
-            LayerName::Image(i) => i.into(),
-            LayerName::Delta(d) => d.into(),
-        }
-    }
-}
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -339,7 +339,7 @@ impl<'de> serde::Deserialize<'de> for LayerName {

 struct LayerNameVisitor;

-impl serde::de::Visitor<'_> for LayerNameVisitor {
+impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
    type Value = LayerName;

    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -99,21 +99,21 @@ impl<'a> PeekableLayerIterRef<'a> {
    }
 }

-impl std::cmp::PartialEq for IteratorWrapper<'_> {
+impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
 }

-impl std::cmp::Eq for IteratorWrapper<'_> {}
+impl<'a> std::cmp::Eq for IteratorWrapper<'a> {}

-impl std::cmp::PartialOrd for IteratorWrapper<'_> {
+impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
 }

-impl std::cmp::Ord for IteratorWrapper<'_> {
+impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        let a = self.peek_next_key_lsn_value();
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -12,154 +12,41 @@ use super::{
    DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
 };

-pub(crate) enum BatchWriterResult {
+pub(crate) enum SplitWriterResult {
    Produced(ResidentLayer),
    Discarded(PersistentLayerKey),
 }

 #[cfg(test)]
-impl BatchWriterResult {
+impl SplitWriterResult {
    fn into_resident_layer(self) -> ResidentLayer {
        match self {
-            BatchWriterResult::Produced(layer) => layer,
-            BatchWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
+            SplitWriterResult::Produced(layer) => layer,
+            SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
        }
    }

    fn into_discarded_layer(self) -> PersistentLayerKey {
        match self {
-            BatchWriterResult::Produced(_) => panic!("unexpected produced layer"),
-            BatchWriterResult::Discarded(layer) => layer,
+            SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
+            SplitWriterResult::Discarded(layer) => layer,
        }
    }
 }

-enum LayerWriterWrapper {
-    Image(ImageLayerWriter),
-    Delta(DeltaLayerWriter),
-}
-
-/// An layer writer that takes unfinished layers and finish them atomically.
-#[must_use]
-pub struct BatchLayerWriter {
-    generated_layer_writers: Vec<(LayerWriterWrapper, PersistentLayerKey)>,
-    conf: &'static PageServerConf,
-}
-
-impl BatchLayerWriter {
-    pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
-        Ok(Self {
-            generated_layer_writers: Vec::new(),
-            conf,
-        })
-    }
-
-    pub fn add_unfinished_image_writer(
-        &mut self,
-        writer: ImageLayerWriter,
-        key_range: Range<Key>,
-        lsn: Lsn,
-    ) {
-        self.generated_layer_writers.push((
-            LayerWriterWrapper::Image(writer),
-            PersistentLayerKey {
-                key_range,
-                lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
-                is_delta: false,
-            },
-        ));
-    }
-
-    pub fn add_unfinished_delta_writer(
-        &mut self,
-        writer: DeltaLayerWriter,
-        key_range: Range<Key>,
-        lsn_range: Range<Lsn>,
-    ) {
-        self.generated_layer_writers.push((
-            LayerWriterWrapper::Delta(writer),
-            PersistentLayerKey {
-                key_range,
-                lsn_range,
-                is_delta: true,
-            },
-        ));
-    }
-
-    pub(crate) async fn finish_with_discard_fn<D, F>(
-        self,
-        tline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        discard_fn: D,
-    ) -> anyhow::Result<Vec<BatchWriterResult>>
-    where
-        D: Fn(&PersistentLayerKey) -> F,
-        F: Future<Output = bool>,
-    {
-        let Self {
-            generated_layer_writers,
-            ..
-        } = self;
-        let clean_up_layers = |generated_layers: Vec<BatchWriterResult>| {
-            for produced_layer in generated_layers {
-                if let BatchWriterResult::Produced(resident_layer) = produced_layer {
-                    let layer: Layer = resident_layer.into();
-                    layer.delete_on_drop();
-                }
-            }
-        };
-        // BEGIN: catch every error and do the recovery in the below section
-        let mut generated_layers: Vec<BatchWriterResult> = Vec::new();
-        for (inner, layer_key) in generated_layer_writers {
-            if discard_fn(&layer_key).await {
-                generated_layers.push(BatchWriterResult::Discarded(layer_key));
-            } else {
-                let res = match inner {
-                    LayerWriterWrapper::Delta(writer) => {
-                        writer.finish(layer_key.key_range.end, ctx).await
-                    }
-                    LayerWriterWrapper::Image(writer) => {
-                        writer
-                            .finish_with_end_key(layer_key.key_range.end, ctx)
-                            .await
-                    }
-                };
-                let layer = match res {
-                    Ok((desc, path)) => {
-                        match Layer::finish_creating(self.conf, tline, desc, &path) {
-                            Ok(layer) => layer,
-                            Err(e) => {
-                                tokio::fs::remove_file(&path).await.ok();
-                                clean_up_layers(generated_layers);
-                                return Err(e);
-                            }
-                        }
-                    }
-                    Err(e) => {
-                        // Image/DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong,
-                        // so we don't need to remove the layer we just failed to create by ourselves.
-                        clean_up_layers(generated_layers);
-                        return Err(e);
-                    }
-                };
-                generated_layers.push(BatchWriterResult::Produced(layer));
-            }
-        }
-        // END: catch every error and do the recovery in the above section
-        Ok(generated_layers)
-    }
-}
-
 /// An image writer that takes images and produces multiple image layers.
+///
+/// The interface does not guarantee atomicity (i.e., if the image layer generation
+/// fails, there might be leftover files to be cleaned up)
 #[must_use]
 pub struct SplitImageLayerWriter {
    inner: ImageLayerWriter,
    target_layer_size: u64,
-    lsn: Lsn,
+    generated_layers: Vec<SplitWriterResult>,
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
-    batches: BatchLayerWriter,
+    lsn: Lsn,
    start_key: Key,
 }

@@ -184,21 +71,27 @@ impl SplitImageLayerWriter {
                ctx,
            )
            .await?,
+            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
-            batches: BatchLayerWriter::new(conf).await?,
            lsn,
            start_key,
        })
    }

-    pub async fn put_image(
+    pub async fn put_image_with_discard_fn<D, F>(
        &mut self,
        key: Key,
        img: Bytes,
+        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+        discard: D,
+    ) -> anyhow::Result<()>
+    where
+        D: FnOnce(&PersistentLayerKey) -> F,
+        F: Future<Output = bool>,
+    {
        // The current estimation is an upper bound of the space that the key/image could take
        // because we did not consider compression in this estimation. The resulting image layer
        // could be smaller than the target size.
@@ -216,34 +109,72 @@ impl SplitImageLayerWriter {
            )
            .await?;
            let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
-            self.batches.add_unfinished_image_writer(
-                prev_image_writer,
-                self.start_key..key,
-                self.lsn,
-            );
+            let layer_key = PersistentLayerKey {
+                key_range: self.start_key..key,
+                lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
+                is_delta: false,
+            };
            self.start_key = key;
+
+            if discard(&layer_key).await {
+                drop(prev_image_writer);
+                self.generated_layers
+                    .push(SplitWriterResult::Discarded(layer_key));
+            } else {
+                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
+
+                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                self.generated_layers
+                    .push(SplitWriterResult::Produced(layer));
+            }
        }
        self.inner.put_image(key, img, ctx).await
    }

+    #[cfg(test)]
+    pub async fn put_image(
+        &mut self,
+        key: Key,
+        img: Bytes,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
+            .await
+    }
+
    pub(crate) async fn finish_with_discard_fn<D, F>(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Key,
-        discard_fn: D,
-    ) -> anyhow::Result<Vec<BatchWriterResult>>
+        discard: D,
+    ) -> anyhow::Result<Vec<SplitWriterResult>>
    where
-        D: Fn(&PersistentLayerKey) -> F,
+        D: FnOnce(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
        let Self {
-            mut batches, inner, ..
+            mut generated_layers,
+            inner,
+            ..
        } = self;
-        if inner.num_keys() != 0 {
-            batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn);
+        if inner.num_keys() == 0 {
+            return Ok(generated_layers);
        }
-        batches.finish_with_discard_fn(tline, ctx, discard_fn).await
+        let layer_key = PersistentLayerKey {
+            key_range: self.start_key..end_key,
+            lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
+            is_delta: false,
+        };
+        if discard(&layer_key).await {
+            generated_layers.push(SplitWriterResult::Discarded(layer_key));
+        } else {
+            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
+            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            generated_layers.push(SplitWriterResult::Produced(layer));
+        }
+        Ok(generated_layers)
    }

    #[cfg(test)]
@@ -252,14 +183,22 @@ impl SplitImageLayerWriter {
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Key,
-    ) -> anyhow::Result<Vec<BatchWriterResult>> {
+    ) -> anyhow::Result<Vec<SplitWriterResult>> {
        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
            .await
    }
+
+    /// This function will be deprecated with #8841.
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
+        Ok((self.generated_layers, self.inner))
+    }
 }

 /// A delta writer that takes key-lsn-values and produces multiple delta layers.
 ///
+/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails,
+/// there might be leftover files to be cleaned up).
+///
 /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
 /// will split them into multiple files based on size.
@@ -267,12 +206,12 @@ impl SplitImageLayerWriter {
 pub struct SplitDeltaLayerWriter {
    inner: Option<(Key, DeltaLayerWriter)>,
    target_layer_size: u64,
+    generated_layers: Vec<SplitWriterResult>,
    conf: &'static PageServerConf,
    timeline_id: TimelineId,
    tenant_shard_id: TenantShardId,
    lsn_range: Range<Lsn>,
    last_key_written: Key,
-    batches: BatchLayerWriter,
 }

 impl SplitDeltaLayerWriter {
@@ -286,22 +225,29 @@ impl SplitDeltaLayerWriter {
        Ok(Self {
            target_layer_size,
            inner: None,
+            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            lsn_range,
            last_key_written: Key::MIN,
-            batches: BatchLayerWriter::new(conf).await?,
        })
    }

-    pub async fn put_value(
+    /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
+    pub async fn put_value_with_discard_fn<D, F>(
        &mut self,
        key: Key,
        lsn: Lsn,
        val: Value,
+        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+        discard: D,
+    ) -> anyhow::Result<()>
+    where
+        D: FnOnce(&PersistentLayerKey) -> F,
+        F: Future<Output = bool>,
+    {
        // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
        // number, and therefore the final layer size could be a little bit larger or smaller than the target.
        //
@@ -340,11 +286,21 @@ impl SplitDeltaLayerWriter {
                .await?;
                let (start_key, prev_delta_writer) =
                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
-                self.batches.add_unfinished_delta_writer(
-                    prev_delta_writer,
-                    start_key..key,
-                    self.lsn_range.clone(),
-                );
+                let layer_key = PersistentLayerKey {
+                    key_range: start_key..key,
+                    lsn_range: self.lsn_range.clone(),
+                    is_delta: true,
+                };
+                if discard(&layer_key).await {
+                    drop(prev_delta_writer);
+                    self.generated_layers
+                        .push(SplitWriterResult::Discarded(layer_key));
+                } else {
+                    let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
+                    let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+                    self.generated_layers
+                        .push(SplitWriterResult::Produced(delta_layer));
+                }
            } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
                // We have to produce a very large file b/c a key is updated too often.
                anyhow::bail!(
@@ -359,30 +315,53 @@ impl SplitDeltaLayerWriter {
        inner.put_value(key, lsn, val, ctx).await
    }

+    pub async fn put_value(
+        &mut self,
+        key: Key,
+        lsn: Lsn,
+        val: Value,
+        tline: &Arc<Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
+            .await
+    }
+
    pub(crate) async fn finish_with_discard_fn<D, F>(
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-        discard_fn: D,
-    ) -> anyhow::Result<Vec<BatchWriterResult>>
+        discard: D,
+    ) -> anyhow::Result<Vec<SplitWriterResult>>
    where
-        D: Fn(&PersistentLayerKey) -> F,
+        D: FnOnce(&PersistentLayerKey) -> F,
        F: Future<Output = bool>,
    {
        let Self {
-            mut batches, inner, ..
+            mut generated_layers,
+            inner,
+            ..
        } = self;
-        if let Some((start_key, writer)) = inner {
-            if writer.num_keys() != 0 {
-                let end_key = self.last_key_written.next();
-                batches.add_unfinished_delta_writer(
-                    writer,
-                    start_key..end_key,
-                    self.lsn_range.clone(),
-                );
-            }
+        let Some((start_key, inner)) = inner else {
+            return Ok(generated_layers);
+        };
+        if inner.num_keys() == 0 {
+            return Ok(generated_layers);
        }
-        batches.finish_with_discard_fn(tline, ctx, discard_fn).await
+        let end_key = self.last_key_written.next();
+        let layer_key = PersistentLayerKey {
+            key_range: start_key..end_key,
+            lsn_range: self.lsn_range.clone(),
+            is_delta: true,
+        };
+        if discard(&layer_key).await {
+            generated_layers.push(SplitWriterResult::Discarded(layer_key));
+        } else {
+            let (desc, path) = inner.finish(end_key, ctx).await?;
+            let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
+            generated_layers.push(SplitWriterResult::Produced(delta_layer));
+        }
+        Ok(generated_layers)
    }

    #[cfg(test)]
@@ -390,10 +369,15 @@ impl SplitDeltaLayerWriter {
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<BatchWriterResult>> {
+    ) -> anyhow::Result<Vec<SplitWriterResult>> {
        self.finish_with_discard_fn(tline, ctx, |_| async { false })
            .await
    }
+
+    /// This function will be deprecated with #8841.
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
+        Ok((self.generated_layers, self.inner.map(|x| x.1)))
+    }
 }

 #[cfg(test)]
@@ -463,7 +447,7 @@ mod tests {
        .unwrap();

        image_writer
-            .put_image(get_key(0), get_img(0), &ctx)
+            .put_image(get_key(0), get_img(0), &tline, &ctx)
            .await
            .unwrap();
        let layers = image_writer
@@ -473,7 +457,13 @@ mod tests {
        assert_eq!(layers.len(), 1);

        delta_writer
-            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
+            .put_value(
+                get_key(0),
+                Lsn(0x18),
+                Value::Image(get_img(0)),
+                &tline,
+                &ctx,
+            )
            .await
            .unwrap();
        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -496,18 +486,14 @@ mod tests {

    #[tokio::test]
    async fn write_split() {
-        // Test the split writer with retaining all the layers we have produced (discard=false)
        write_split_helper("split_writer_write_split", false).await;
    }

    #[tokio::test]
    async fn write_split_discard() {
-        // Test the split writer with discarding all the layers we have produced (discard=true)
-        write_split_helper("split_writer_write_split_discard", true).await;
+        write_split_helper("split_writer_write_split_discard", false).await;
    }

-    /// Test the image+delta writer by writing a large number of images and deltas. If discard is
-    /// set to true, all layers will be discarded.
    async fn write_split_helper(harness_name: &'static str, discard: bool) {
        let harness = TenantHarness::create(harness_name).await.unwrap();
        let (tenant, ctx) = harness.load().await;
@@ -541,63 +527,69 @@ mod tests {
        for i in 0..N {
            let i = i as u32;
            image_writer
-                .put_image(get_key(i), get_large_img(), &ctx)
+                .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
+                    discard
+                })
                .await
                .unwrap();
            delta_writer
-                .put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx)
+                .put_value_with_discard_fn(
+                    get_key(i),
+                    Lsn(0x20),
+                    Value::Image(get_large_img()),
+                    &tline,
+                    &ctx,
+                    |_| async { discard },
+                )
                .await
                .unwrap();
        }
        let image_layers = image_writer
-            .finish_with_discard_fn(&tline, &ctx, get_key(N as u32), |_| async { discard })
+            .finish(&tline, &ctx, get_key(N as u32))
            .await
            .unwrap();
-        let delta_layers = delta_writer
-            .finish_with_discard_fn(&tline, &ctx, |_| async { discard })
-            .await
-            .unwrap();
-        let image_layers = image_layers
-            .into_iter()
-            .map(|x| {
-                if discard {
-                    x.into_discarded_layer()
-                } else {
-                    x.into_resident_layer().layer_desc().key()
+        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
+        if discard {
+            for layer in image_layers {
+                layer.into_discarded_layer();
+            }
+            for layer in delta_layers {
+                layer.into_discarded_layer();
+            }
+        } else {
+            let image_layers = image_layers
+                .into_iter()
+                .map(|x| x.into_resident_layer())
+                .collect_vec();
+            let delta_layers = delta_layers
+                .into_iter()
+                .map(|x| x.into_resident_layer())
+                .collect_vec();
+            assert_eq!(image_layers.len(), N / 512 + 1);
+            assert_eq!(delta_layers.len(), N / 512 + 1);
+            assert_eq!(
+                delta_layers.first().unwrap().layer_desc().key_range.start,
+                get_key(0)
+            );
+            assert_eq!(
+                delta_layers.last().unwrap().layer_desc().key_range.end,
+                get_key(N as u32)
+            );
+            for idx in 0..image_layers.len() {
+                assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
+                assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
+                assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
+                assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
+                if idx > 0 {
+                    assert_eq!(
+                        image_layers[idx - 1].layer_desc().key_range.end,
+                        image_layers[idx].layer_desc().key_range.start
+                    );
+                    assert_eq!(
+                        delta_layers[idx - 1].layer_desc().key_range.end,
+                        delta_layers[idx].layer_desc().key_range.start
+                    );
                }
-            })
-            .collect_vec();
-        let delta_layers = delta_layers
-            .into_iter()
-            .map(|x| {
-                if discard {
-                    x.into_discarded_layer()
-                } else {
-                    x.into_resident_layer().layer_desc().key()
-                }
-            })
-            .collect_vec();
-        assert_eq!(image_layers.len(), N / 512 + 1);
-        assert_eq!(delta_layers.len(), N / 512 + 1);
-        assert_eq!(delta_layers.first().unwrap().key_range.start, get_key(0));
-        assert_eq!(
-            delta_layers.last().unwrap().key_range.end,
-            get_key(N as u32)
-        );
-        for idx in 0..image_layers.len() {
-            assert_ne!(image_layers[idx].key_range.start, Key::MIN);
-            assert_ne!(image_layers[idx].key_range.end, Key::MAX);
-            assert_ne!(delta_layers[idx].key_range.start, Key::MIN);
-            assert_ne!(delta_layers[idx].key_range.end, Key::MAX);
-            if idx > 0 {
-                assert_eq!(
-                    image_layers[idx - 1].key_range.end,
-                    image_layers[idx].key_range.start
-                );
-                assert_eq!(
-                    delta_layers[idx - 1].key_range.end,
-                    delta_layers[idx].key_range.start
-                );
            }
        }
    }
@@ -637,11 +629,11 @@ mod tests {
        .unwrap();

        image_writer
-            .put_image(get_key(0), get_img(0), &ctx)
+            .put_image(get_key(0), get_img(0), &tline, &ctx)
            .await
            .unwrap();
        image_writer
-            .put_image(get_key(1), get_large_img(), &ctx)
+            .put_image(get_key(1), get_large_img(), &tline, &ctx)
            .await
            .unwrap();
        let layers = image_writer
@@ -651,11 +643,23 @@ mod tests {
        assert_eq!(layers.len(), 2);

        delta_writer
-            .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx)
+            .put_value(
+                get_key(0),
+                Lsn(0x18),
+                Value::Image(get_img(0)),
+                &tline,
+                &ctx,
+            )
            .await
            .unwrap();
        delta_writer
-            .put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx)
+            .put_value(
+                get_key(1),
+                Lsn(0x1A),
+                Value::Image(get_large_img()),
+                &tline,
+                &ctx,
+            )
            .await
            .unwrap();
        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
@@ -719,6 +723,7 @@ mod tests {
                    get_key(0),
                    Lsn(i as u64 * 16 + 0x10),
                    Value::Image(get_large_img()),
+                    &tline,
                    &ctx,
                )
                .await
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -28,9 +28,9 @@ use pageserver_api::{
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
-        CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo,
-        DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
-        LsnLease, TimelineState,
+        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
+        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
+        InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
    },
    reltag::BlockNumber,
    shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -98,12 +98,12 @@ use crate::{
 use crate::{
    metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
-use crate::{
-    pgdatadir_mapping::DirectoryKind,
-    virtual_file::{MaybeFatalIo, VirtualFile},
-};
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
 use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
+use crate::{
+    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
+    virtual_file::{MaybeFatalIo, VirtualFile},
+};
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;

 use crate::config::PageServerConf;
@@ -206,6 +206,11 @@ pub struct TimelineResources {
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }

+pub(crate) struct AuxFilesState {
+    pub(crate) dir: Option<AuxFilesDirectory>,
+    pub(crate) n_deltas: usize,
+}
+
 /// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
 /// ingestion considerably, because WAL ingestion needs to check on most records if the record
 /// implicitly extends the relation.  At startup, `complete_as_of` is initialized to the current end
@@ -371,7 +376,7 @@ pub struct Timeline {

    /// Prevent two tasks from deleting the timeline at the same time. If held, the
    /// timeline is being deleted. If 'true', the timeline has already been deleted.
-    pub delete_progress: TimelineDeleteProgress,
+    pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

@@ -408,9 +413,15 @@ pub struct Timeline {
    timeline_get_throttle:
        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,

+    /// Keep aux directory cache to avoid its reconstruction on each update
+    pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
+
    /// Size estimator for aux file v2
    pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,

+    /// Last aux file policy stored for this timeline; indicates whether aux file v2 storage is enabled.
+    pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
+
    /// Some test cases directly place keys into the timeline without actually modifying the directory
    /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
    /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
@@ -424,13 +435,8 @@ pub struct Timeline {
    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,

    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
-
-    /// Cf. [`crate::tenant::CreateTimelineIdempotency`].
-    pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency,
 }

-pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
-
 pub struct WalReceiverInfo {
    pub wal_source_connconf: PgConnectionConfig,
    pub last_received_msg_lsn: Lsn,
@@ -1559,7 +1565,6 @@ impl Timeline {
    }

    /// Checks if the internal state of the timeline is consistent with it being able to be offloaded.
-    ///
    /// This is neccessary but not sufficient for offloading of the timeline as it might have
    /// child timelines that are not offloaded yet.
    pub(crate) fn can_offload(&self) -> bool {
@@ -2006,6 +2011,14 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
    }

+    pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .switch_aux_file_policy
+            .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy)
+    }
+
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2138,8 +2151,8 @@ impl Timeline {
        resources: TimelineResources,
        pg_version: u32,
        state: TimelineState,
+        aux_file_policy: Option<AuxFilePolicy>,
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
-        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2256,7 +2269,7 @@ impl Timeline {
                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
-                delete_progress: TimelineDeleteProgress::default(),
+                delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),

                cancel,
                gate: Gate::default(),
@@ -2268,8 +2281,15 @@ impl Timeline {

                timeline_get_throttle: resources.timeline_get_throttle,

+                aux_files: tokio::sync::Mutex::new(AuxFilesState {
+                    dir: None,
+                    n_deltas: 0,
+                }),
+
                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),

+                last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
+
                #[cfg(test)]
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),

@@ -2278,10 +2298,12 @@ impl Timeline {
                handles: Default::default(),

                attach_wal_lag_cooldown,
-
-                create_idempotency,
            };

+            if aux_file_policy == Some(AuxFilePolicy::V1) {
+                warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)");
+            }
+
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

@@ -2410,7 +2432,7 @@ impl Timeline {
    pub(super) async fn load_layer_map(
        &self,
        disk_consistent_lsn: Lsn,
-        index_part: IndexPart,
+        index_part: Option<IndexPart>,
    ) -> anyhow::Result<()> {
        use init::{Decision::*, Discovered, DismissedLayer};
        use LayerName::*;
@@ -2474,7 +2496,8 @@ impl Timeline {
                    );
                }

-                let decided = init::reconcile(discovered_layers, &index_part, disk_consistent_lsn);
+                let decided =
+                    init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);

                let mut loaded_layers = Vec::new();
                let mut needs_cleanup = Vec::new();
@@ -4455,6 +4478,14 @@ impl Timeline {
    ) -> Result<(), detach_ancestor::Error> {
        detach_ancestor::complete(self, tenant, attempt, ctx).await
    }
+
+    /// Switch aux file policy and schedule upload to the index part.
+    pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
+        self.last_aux_file_policy.store(Some(policy));
+        self.remote_client
+            .schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
+        Ok(())
+    }
 }

 impl Drop for Timeline {
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -29,14 +29,13 @@ use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
-use crate::tenant::storage_layer::batch_split_writer::{
-    BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
-};
 use crate::tenant::storage_layer::filter_iterator::FilterIterator;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
+use crate::tenant::storage_layer::split_writer::{
+    SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
+};
 use crate::tenant::storage_layer::{
    AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
 };
@@ -121,12 +120,18 @@ impl KeyHistoryRetention {
    async fn pipe_to(
        self,
        key: Key,
+        tline: &Arc<Timeline>,
        delta_writer: &mut SplitDeltaLayerWriter,
        mut image_writer: Option<&mut SplitImageLayerWriter>,
        stat: &mut CompactionStatistics,
+        dry_run: bool,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut first_batch = true;
+        let discard = |key: &PersistentLayerKey| {
+            let key = key.clone();
+            async move { Self::discard_key(&key, tline, dry_run).await }
+        };
        for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
            if first_batch {
                if logs.len() == 1 && logs[0].1.is_image() {
@@ -135,30 +140,45 @@ impl KeyHistoryRetention {
                    };
                    stat.produce_image_key(img);
                    if let Some(image_writer) = image_writer.as_mut() {
-                        image_writer.put_image(key, img.clone(), ctx).await?;
+                        image_writer
+                            .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
+                            .await?;
                    } else {
                        delta_writer
-                            .put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx)
+                            .put_value_with_discard_fn(
+                                key,
+                                cutoff_lsn,
+                                Value::Image(img.clone()),
+                                tline,
+                                ctx,
+                                discard,
+                            )
                            .await?;
                    }
                } else {
                    for (lsn, val) in logs {
                        stat.produce_key(&val);
-                        delta_writer.put_value(key, lsn, val, ctx).await?;
+                        delta_writer
+                            .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                            .await?;
                    }
                }
                first_batch = false;
            } else {
                for (lsn, val) in logs {
                    stat.produce_key(&val);
-                    delta_writer.put_value(key, lsn, val, ctx).await?;
+                    delta_writer
+                        .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                        .await?;
                }
            }
        }
        let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
        for (lsn, val) in above_horizon_logs {
            stat.produce_key(&val);
-            delta_writer.put_value(key, lsn, val, ctx).await?;
+            delta_writer
+                .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
+                .await?;
        }
        Ok(())
    }
@@ -834,12 +854,7 @@ impl Timeline {
                if self.cancel.is_cancelled() {
                    return Err(CompactionError::ShuttingDown);
                }
-                let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
-                let keys = delta
-                    .index_entries(ctx)
-                    .await
-                    .map_err(CompactionError::Other)?;
-                all_keys.extend(keys);
+                all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
            }
            // The current stdlib sorting implementation is designed in a way where it is
            // particularly fast where the slice is made up of sorted sub-ranges.
@@ -1676,45 +1691,6 @@ impl Timeline {
        unreachable!("key retention is empty")
    }

-    /// Check how much space is left on the disk
-    async fn check_available_space(self: &Arc<Self>) -> anyhow::Result<u64> {
-        let tenants_dir = self.conf.tenants_path();
-
-        let stat = Statvfs::get(&tenants_dir, None)
-            .context("statvfs failed, presumably directory got unlinked")?;
-
-        let (avail_bytes, _) = stat.get_avail_total_bytes();
-
-        Ok(avail_bytes)
-    }
-
-    /// Check if the compaction can proceed safely without running out of space. We assume the size
-    /// upper bound of the produced files of a compaction job is the same as all layers involved in
-    /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a
-    /// compaction.
-    async fn check_compaction_space(
-        self: &Arc<Self>,
-        layer_selection: &[Layer],
-    ) -> anyhow::Result<()> {
-        let available_space = self.check_available_space().await?;
-        let mut remote_layer_size = 0;
-        let mut all_layer_size = 0;
-        for layer in layer_selection {
-            let needs_download = layer.needs_download().await?;
-            if needs_download.is_some() {
-                remote_layer_size += layer.layer_desc().file_size;
-            }
-            all_layer_size += layer.layer_desc().file_size;
-        }
-        let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
-        if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
-        {
-            return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
-                available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size));
-        }
-        Ok(())
-    }
-
    /// An experimental compaction building block that combines compaction with garbage collection.
    ///
    /// The current implementation picks all delta + image layers that are below or intersecting with
@@ -1830,8 +1806,6 @@ impl Timeline {
            lowest_retain_lsn
        );

-        self.check_compaction_space(&layer_selection).await?;
-
        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
        // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
        let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
@@ -1974,9 +1948,11 @@ impl Timeline {
                retention
                    .pipe_to(
                        *last_key,
+                        self,
                        &mut delta_layer_writer,
                        image_layer_writer.as_mut(),
                        &mut stat,
+                        dry_run,
                        ctx,
                    )
                    .await?;
@@ -2003,9 +1979,11 @@ impl Timeline {
        retention
            .pipe_to(
                last_key,
+                self,
                &mut delta_layer_writer,
                image_layer_writer.as_mut(),
                &mut stat,
+                dry_run,
                ctx,
            )
            .await?;
@@ -2021,7 +1999,8 @@ impl Timeline {
                    .finish_with_discard_fn(self, ctx, Key::MAX, discard)
                    .await?
            } else {
-                drop(writer);
+                let (layers, _) = writer.take()?;
+                assert!(layers.is_empty(), "image layers produced in dry run mode?");
                Vec::new()
            }
        } else {
@@ -2033,7 +2012,8 @@ impl Timeline {
                .finish_with_discard_fn(self, ctx, discard)
                .await?
        } else {
-            drop(delta_layer_writer);
+            let (layers, _) = delta_layer_writer.take()?;
+            assert!(layers.is_empty(), "delta layers produced in dry run mode?");
            Vec::new()
        };

@@ -2043,11 +2023,11 @@ impl Timeline {
        let produced_image_layers_len = produced_image_layers.len();
        for action in produced_delta_layers {
            match action {
-                BatchWriterResult::Produced(layer) => {
+                SplitWriterResult::Produced(layer) => {
                    stat.produce_delta_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
-                BatchWriterResult::Discarded(l) => {
+                SplitWriterResult::Discarded(l) => {
                    keep_layers.insert(l);
                    stat.discard_delta_layer();
                }
@@ -2055,11 +2035,11 @@ impl Timeline {
        }
        for action in produced_image_layers {
            match action {
-                BatchWriterResult::Produced(layer) => {
+                SplitWriterResult::Produced(layer) => {
                    stat.produce_image_layer(layer.layer_desc().file_size());
                    compact_to.push(layer);
                }
-                BatchWriterResult::Discarded(l) => {
+                SplitWriterResult::Discarded(l) => {
                    keep_layers.insert(l);
                    stat.discard_image_layer();
                }
@@ -2443,7 +2423,7 @@ impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
    type DeltaEntry<'a> = DeltaEntry<'a>;

    async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
-        self.0.get_as_delta(ctx).await?.index_entries(ctx).await
+        self.0.load_keys(ctx).await
    }
 }

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,9 +14,7 @@ use crate::{
    task_mgr::{self, TaskKind},
    tenant::{
        metadata::TimelineMetadata,
-        remote_timeline_client::{
-            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
-        },
+        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
        CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
    },
 };
@@ -27,9 +25,12 @@ use super::{Timeline, TimelineResources};
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
 async fn set_deleted_in_remote_index(
-    remote_client: &Arc<RemoteTimelineClient>,
+    timeline: &TimelineOrOffloaded,
 ) -> Result<(), DeleteTimelineError> {
-    let res = remote_client.persist_index_part_with_deleted_flag().await;
+    let res = timeline
+        .remote_client()
+        .persist_index_part_with_deleted_flag()
+        .await;
    match res {
        // If we (now, or already) marked it successfully as deleted, we can proceed
        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
@@ -128,10 +129,12 @@ pub(super) async fn delete_local_timeline_directory(
 }

 /// Removes remote layers and an index file after them.
-async fn delete_remote_layers_and_index(
-    remote_client: &Arc<RemoteTimelineClient>,
-) -> anyhow::Result<()> {
-    remote_client.delete_all().await.context("delete_all")
+async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> {
+    timeline
+        .remote_client()
+        .delete_all()
+        .await
+        .context("delete_all")
 }

 /// It is important that this gets called when DeletionGuard is being held.
@@ -176,32 +179,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
    Ok(())
 }

-/// It is important that this gets called when DeletionGuard is being held.
-/// For more context see comments in [`DeleteTimelineFlow::prepare`]
-async fn upload_new_tenant_manifest(
-    tenant: &Tenant,
-    _: &DeletionGuard, // using it as a witness
-) -> anyhow::Result<()> {
-    // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
-    // between the deletion of the index-part.json and reaching of this code.
-    // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
-    // However, we handle this case in tenant loading code so the next time we attach, the issue is
-    // resolved.
-    let manifest = tenant.tenant_manifest();
-    // TODO: generation support
-    let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
-    remote_timeline_client::upload_tenant_manifest(
-        &tenant.remote_storage,
-        &tenant.tenant_shard_id,
-        generation,
-        &manifest,
-        &tenant.cancel,
-    )
-    .await?;
-
-    Ok(())
-}
-
 /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -258,8 +235,7 @@ impl DeleteTimelineFlow {
            ))?
        });

-        let remote_client = timeline.remote_client_maybe_construct(tenant);
-        set_deleted_in_remote_index(&remote_client).await?;
+        set_deleted_in_remote_index(&timeline).await?;

        fail::fail_point!("timeline-delete-before-schedule", |_| {
            Err(anyhow::anyhow!(
@@ -267,13 +243,7 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(
-            guard,
-            tenant.conf,
-            Arc::clone(tenant),
-            timeline,
-            remote_client,
-        );
+        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);

        Ok(())
    }
@@ -313,7 +283,8 @@ impl DeleteTimelineFlow {
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
-                crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
+                // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
+                None,
            )
            .context("create_timeline_struct")?;

@@ -332,9 +303,8 @@ impl DeleteTimelineFlow {

        guard.mark_in_progress()?;

-        let remote_client = timeline.remote_client.clone();
        let timeline = TimelineOrOffloaded::Timeline(timeline);
-        Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client);
+        Self::schedule_background(guard, tenant.conf, tenant, timeline);

        Ok(())
    }
@@ -412,7 +382,6 @@ impl DeleteTimelineFlow {
        conf: &'static PageServerConf,
        tenant: Arc<Tenant>,
        timeline: TimelineOrOffloaded,
-        remote_client: Arc<RemoteTimelineClient>,
    ) {
        let tenant_shard_id = timeline.tenant_shard_id();
        let timeline_id = timeline.timeline_id();
@@ -424,7 +393,7 @@ impl DeleteTimelineFlow {
            Some(timeline_id),
            "timeline_delete",
            async move {
-                if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
+                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                    error!("Error: {err:#}");
                    if let TimelineOrOffloaded::Timeline(timeline) = timeline {
                        timeline.set_broken(format!("{err:#}"))
@@ -441,7 +410,6 @@ impl DeleteTimelineFlow {
        conf: &PageServerConf,
        tenant: &Tenant,
        timeline: &TimelineOrOffloaded,
-        remote_client: Arc<RemoteTimelineClient>,
    ) -> Result<(), DeleteTimelineError> {
        // Offloaded timelines have no local state
        // TODO: once we persist offloaded information, delete the timeline from there, too
@@ -449,14 +417,12 @@ impl DeleteTimelineFlow {
            delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
        }

-        delete_remote_layers_and_index(&remote_client).await?;
+        delete_remote_layers_and_index(timeline).await?;

        pausable_failpoint!("in_progress_delete");

        remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;

-        upload_new_tenant_manifest(tenant, &guard).await?;
-
        *guard = Self::Finished;

        Ok(())
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -125,9 +125,19 @@ pub(super) enum DismissedLayer {
 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
-    index_part: &IndexPart,
+    index_part: Option<&IndexPart>,
    disk_consistent_lsn: Lsn,
 ) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
+    let Some(index_part) = index_part else {
+        // If we have no remote metadata, no local layer files are considered valid to load
+        return local_layers
+            .into_iter()
+            .map(|(layer_name, local_metadata)| {
+                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
+            })
+            .collect();
+    };
+
    let mut result = Vec::new();

    let mut remote_layers = HashMap::new();
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -45,16 +45,13 @@ impl LayerManager {
    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.try_get_from_key(key)
+        self.layers()
+            .get(key)
            .with_context(|| format!("get layer from key: {key}"))
            .expect("not found")
            .clone()
    }

-    pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> {
-        self.layers().get(key)
-    }
-
    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.get_from_key(&desc.key())
    }
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -1,17 +1,17 @@
 use std::sync::Arc;

-use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
-use super::Timeline;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded};
+use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded};
+
+use super::{
+    delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard},
+    Timeline,
+};

 pub(crate) async fn offload_timeline(
    tenant: &Tenant,
    timeline: &Arc<Timeline>,
 ) -> anyhow::Result<()> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
    tracing::info!("offloading archived timeline");
-
    let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?;

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
@@ -19,28 +19,14 @@ pub(crate) async fn offload_timeline(
        return Ok(());
    };

-    let is_archived = timeline.is_archived();
-    match is_archived {
-        Some(true) => (),
-        Some(false) => {
-            tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
-            anyhow::bail!("timeline isn't archived");
-        }
-        None => {
-            tracing::warn!(
-                ?is_archived,
-                "tried offloading a timeline where manifest is not yet available"
-            );
-            anyhow::bail!("timeline manifest hasn't been loaded yet");
-        }
-    }
-
    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
    timeline.shutdown(super::ShutdownMode::Hard).await;

    // TODO extend guard mechanism above with method
    // to make deletions possible while offloading is in progress

+    // TODO mark timeline as offloaded in S3
+
    let conf = &tenant.conf;
    delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?;

@@ -50,31 +36,10 @@ pub(crate) async fn offload_timeline(
        let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
        offloaded_timelines.insert(
            timeline.timeline_id,
-            Arc::new(
-                OffloadedTimeline::from_timeline(&timeline)
-                    .expect("we checked above that timeline was ready"),
-            ),
+            Arc::new(OffloadedTimeline::from_timeline(&timeline)),
        );
    }

-    // Last step: mark timeline as offloaded in S3
-    // TODO: maybe move this step above, right above deletion of the local timeline directory,
-    // then there is no potential race condition where we partially offload a timeline, and
-    // at the next restart attach it again.
-    // For that to happen, we'd need to make the manifest reflect our *intended* state,
-    // not our actual state of offloaded timelines.
-    let manifest = tenant.tenant_manifest();
-    // TODO: generation support
-    let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
-    remote_timeline_client::upload_tenant_manifest(
-        &tenant.remote_storage,
-        &tenant.tenant_shard_id,
-        generation,
-        &manifest,
-        &tenant.cancel,
-    )
-    .await?;
-
    Ok(())
 }

--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -5,11 +5,7 @@ use camino::Utf8PathBuf;
 use tracing::{error, info, info_span};
 use utils::{fs_ext, id::TimelineId, lsn::Lsn};

-use crate::{
-    context::RequestContext,
-    import_datadir,
-    tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
-};
+use crate::{context::RequestContext, import_datadir, tenant::Tenant};

 use super::Timeline;

@@ -169,17 +165,13 @@ pub(crate) struct TimelineCreateGuard<'t> {
    owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
    pub(crate) timeline_path: Utf8PathBuf,
-    pub(crate) idempotency: CreateTimelineIdempotency,
 }

 /// Errors when acquiring exclusive access to a timeline ID for creation
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum TimelineExclusionError {
    #[error("Already exists")]
-    AlreadyExists {
-        existing: TimelineOrOffloaded,
-        arg: CreateTimelineIdempotency,
-    },
+    AlreadyExists(Arc<Timeline>),
    #[error("Already creating")]
    AlreadyCreating,

@@ -193,42 +185,27 @@ impl<'t> TimelineCreateGuard<'t> {
        owning_tenant: &'t Tenant,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
-        idempotency: CreateTimelineIdempotency,
-        allow_offloaded: bool,
    ) -> Result<Self, TimelineExclusionError> {
        // Lock order: this is the only place we take both locks.  During drop() we only
        // lock creating_timelines
        let timelines = owning_tenant.timelines.lock().unwrap();
-        let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap();
        let mut creating_timelines: std::sync::MutexGuard<
            '_,
            std::collections::HashSet<TimelineId>,
        > = owning_tenant.timelines_creating.lock().unwrap();

        if let Some(existing) = timelines.get(&timeline_id) {
-            return Err(TimelineExclusionError::AlreadyExists {
-                existing: TimelineOrOffloaded::Timeline(existing.clone()),
-                arg: idempotency,
-            });
+            Err(TimelineExclusionError::AlreadyExists(existing.clone()))
+        } else if creating_timelines.contains(&timeline_id) {
+            Err(TimelineExclusionError::AlreadyCreating)
+        } else {
+            creating_timelines.insert(timeline_id);
+            Ok(Self {
+                owning_tenant,
+                timeline_id,
+                timeline_path,
+            })
        }
-        if !allow_offloaded {
-            if let Some(existing) = timelines_offloaded.get(&timeline_id) {
-                return Err(TimelineExclusionError::AlreadyExists {
-                    existing: TimelineOrOffloaded::Offloaded(existing.clone()),
-                    arg: idempotency,
-                });
-            }
-        }
-        if creating_timelines.contains(&timeline_id) {
-            return Err(TimelineExclusionError::AlreadyCreating);
-        }
-        creating_timelines.insert(timeline_id);
-        Ok(Self {
-            owning_tenant,
-            timeline_id,
-            timeline_path,
-            idempotency,
-        })
    }
 }

--- a/Show More
+++ b/Show More