diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 1ecb5ecc7e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..dea3fc2357 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..8acba7ad00 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -13,7 +13,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..7f0e599b97 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -13,7 +13,7 @@ inputs: default: 15 api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: 
console-stage.neon.build provisioner: desctiption: 'k8s-pod or k8s-neonvm' default: 'k8s-pod' diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..b8ec6cac70 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -10,7 +10,7 @@ inputs: required: true api_host: desctiption: 'Neon API host' - default: console.stage.neon.tech + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 69c48d86b9..ab616d17e2 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 251423e701..c527cef1ac 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -21,6 +21,7 @@ defaults: concurrency: group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 71cb56df91..a6fdd1953e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -480,6 +480,8 @@ jobs: BUILD_TAG: ${{ needs.tag.outputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 @@ -559,6 +561,9 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -738,7 +743,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 - uses: docker/login-action@v3 with: @@ -795,7 +800,7 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v3 + - uses: docker/setup-buildx-action@v2 with: # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. 
@@ -868,7 +873,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.23.2 + VM_BUILDER_VERSION: v0.28.1 steps: - name: Checkout @@ -1136,8 +1141,6 @@ jobs: -f deployPreprodRegion=true gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ - -f deployPgSniRouter=false \ - -f deployProxy=false \ -f deployStorage=true \ -f deployStorageBroker=true \ -f deployStorageController=true \ diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 28646dfc19..a1e22cf93f 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -28,7 +28,9 @@ jobs: - name: Get build-tools image tag for the current commit id: get-build-tools-tag env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, + # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly. 
+ COMMIT_SHA: ${{ github.sha }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | LAST_BUILD_TOOLS_SHA=$( diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c941692066..d495a158e8 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -20,6 +20,7 @@ defaults: concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false permissions: {} diff --git a/Cargo.lock b/Cargo.lock index 9d78a382ea..c71a0e550e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" + [[package]] name = "autocfg" version = "1.1.0" @@ -298,7 +304,7 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", @@ -335,7 +341,7 @@ dependencies = [ "bytes", "fastrand 2.0.0", "http 0.2.9", - "http-body", + "http-body 0.4.5", "percent-encoding", "pin-project-lite", "tracing", @@ -386,7 +392,7 @@ dependencies = [ "aws-types", "bytes", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "regex-lite", @@ -514,7 +520,7 @@ dependencies = [ "crc32fast", "hex", "http 0.2.9", - "http-body", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -546,7 +552,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -585,15 +591,15 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", - "hyper-rustls", + "http-body 0.4.5", + "hyper 0.14.26", + "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.9", + "rustls 0.21.11", "tokio", 
"tracing", ] @@ -626,7 +632,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.9", - "http-body", + "http-body 0.4.5", "itoa", "num-integer", "pin-project-lite", @@ -675,10 +681,10 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", - "matchit", + "matchit 0.7.0", "memchr", "mime", "percent-encoding", @@ -691,7 +697,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.20.0", "tower", "tower-layer", "tower-service", @@ -707,7 +713,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.9", - "http-body", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -716,9 +722,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" +checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" dependencies = [ "async-trait", "base64 0.21.1", @@ -734,7 +740,7 @@ dependencies = [ "pin-project", "quick-xml", "rand 0.8.5", - "reqwest", + "reqwest 0.11.19", "rustc_version", "serde", "serde_json", @@ -746,9 +752,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" +checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" dependencies = [ "async-lock", "async-trait", @@ -766,9 +772,9 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" +checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" dependencies = [ "RustyXML", 
"async-lock", @@ -785,9 +791,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" +checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" dependencies = [ "RustyXML", "azure_core", @@ -806,9 +812,9 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" dependencies = [ "azure_core", "bytes", @@ -859,6 +865,12 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -1124,7 +1136,7 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.52", @@ -1196,7 +1208,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", + "hyper 0.14.26", "nix 0.27.1", "notify", "num_cpus", @@ -1204,7 +1216,7 @@ dependencies = [ "postgres", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rust-ini", "serde", "serde_json", @@ -1313,7 +1325,8 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "humantime-serde", + "hyper 0.14.26", "nix 0.27.1", "once_cell", "pageserver_api", @@ -1322,7 +1335,7 @@ dependencies = [ "postgres_backend", 
"postgres_connection", "regex", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "serde", @@ -1462,12 +1475,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1840,23 +1850,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2213,6 +2212,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "1.8.2" @@ -2294,6 +2312,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2345,6 +2369,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "hostname" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" +dependencies = [ + "cfg-if", + "libc", + "windows 0.52.0", +] + [[package]] name = "http" version = "0.2.9" @@ -2378,6 +2413,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", +] + [[package]] name = "http-types" version = "2.12.0" @@ -2436,9 +2494,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2450,6 +2508,27 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-rustls" version = "0.24.0" @@ -2457,21 +2536,38 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "log", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", ] +[[package]] +name = "hyper-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +dependencies = [ + "futures-util", + "http 1.1.0", + "hyper 1.2.0", + "hyper-util", + "rustls 0.22.4", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.25.0", + "tower-service", +] + [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2484,7 +2580,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.26", "native-tls", "tokio", "tokio-native-tls", @@ -2492,15 +2588,37 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" +checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad" dependencies = [ - "hyper", + "http-body-util", + "hyper 1.2.0", + "hyper-util", "pin-project-lite", "tokio", - "tokio-tungstenite", - "tungstenite", + "tokio-tungstenite 0.21.0", + "tungstenite 0.21.0", +] + +[[package]] +name = "hyper-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" 
+dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", + "pin-project-lite", + "socket2 0.5.5", + "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -2514,7 +2632,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows 0.48.0", ] [[package]] @@ -2685,9 +2803,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -2794,6 +2912,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "lock_api" version = "0.4.10" @@ -2831,6 +2955,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" +[[package]] +name = "matchit" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" + [[package]] name = "md-5" version = "0.10.5" @@ -2848,11 +2978,12 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" 
dependencies = [ "bytes", + "crossbeam-utils", "hashbrown 0.14.0", "itoa", "lasso", @@ -2865,16 +2996,27 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.13" +version = "0.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "measured-process" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2914,8 +3056,10 @@ version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", - "procfs", + "procfs 0.14.2", "prometheus", "rand 0.8.5", "rand_distr", @@ -2950,16 +3094,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -3086,6 +3220,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -3293,7 +3437,7 @@ dependencies = [ "bytes", "http 0.2.9", "opentelemetry_api", - "reqwest", + "reqwest 0.11.19", ] [[package]] @@ -3311,7 
+3455,7 @@ dependencies = [ "opentelemetry_api", "opentelemetry_sdk", "prost", - "reqwest", + "reqwest 0.11.19", "thiserror", "tokio", "tonic", @@ -3422,6 +3566,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -3465,12 +3615,17 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] @@ -3505,7 +3660,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", "leaky-bucket", "md5", @@ -3524,11 +3679,11 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "procfs", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rpds", "scopeguard", "serde", @@ -3554,6 +3709,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "twox-hash", "url", "utils", "walkdir", @@ -3597,7 +3753,7 @@ dependencies = [ "futures", "pageserver_api", "postgres", - "reqwest", + "reqwest 0.12.4", "serde", "thiserror", "tokio", @@ -3954,7 +4110,7 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4083,6 +4239,29 @@ dependencies = [ "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + 
+[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -4095,7 +4274,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -4116,7 +4295,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -4160,6 +4339,7 @@ dependencies = [ "anyhow", "async-compression", "async-trait", + "atomic-take", "aws-config", "aws-sdk-iam", "aws-sigv4", @@ -4181,15 +4361,19 @@ dependencies = [ "hashlink", "hex", "hmac", - "hostname", + "hostname 0.3.1", "http 1.1.0", + "http-body-util", "humantime", - "hyper", + "hyper 0.14.26", + "hyper 1.2.0", "hyper-tungstenite", + "hyper-util", "ipnet", "itertools", "lasso", "md5", + "measured", "metrics", "native-tls", "once_cell", @@ -4210,14 +4394,14 @@ dependencies = [ "redis", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", "routerify", "rstest", "rustc-hash", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4237,6 +4421,7 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls 0.25.0", "tokio-util", + "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -4409,7 +4594,7 @@ dependencies = [ "itoa", "percent-encoding", "pin-project-lite", - "rustls 0.22.2", + "rustls 0.22.4", "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "rustls-pki-types", @@ -4518,7 +4703,7 @@ dependencies = [ "futures-util", "http-types", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4527,6 +4712,7 @@ dependencies = 
[ "scopeguard", "serde", "serde_json", + "sync_wrapper", "test-context", "tokio", "tokio-stream", @@ -4548,73 +4734,110 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", - "hyper-rustls", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-tls", "ipnet", "js-sys", "log", "mime", - "mime_guess", "native-tls", "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", - "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", - "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", + "wasm-streams 0.3.0", "web-sys", - "webpki-roots 0.25.2", - "winreg", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.2.0", + "hyper-rustls 0.26.0", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams 0.4.0", + "web-sys", + "webpki-roots 0.26.1", + "winreg 0.52.0", ] [[package]] name = "reqwest-middleware" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" +checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" dependencies = [ "anyhow", "async-trait", - "http 0.2.9", - 
"reqwest", + "http 1.1.0", + "reqwest 0.12.4", "serde", - "task-local-extensions", "thiserror", + "tower-service", ] [[package]] name = "reqwest-retry" -version = "0.2.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" dependencies = [ "anyhow", "async-trait", "chrono", "futures", "getrandom 0.2.11", - "http 0.2.9", - "hyper", + "http 1.1.0", + "hyper 1.2.0", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "retry-policies", - "task-local-extensions", "tokio", "tracing", "wasm-timer", @@ -4622,27 +4845,27 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.7" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3" +checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "matchit", + "http 1.1.0", + "matchit 0.8.2", "opentelemetry", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", - "task-local-extensions", "tracing", "tracing-opentelemetry", ] [[package]] name = "retry-policies" -version = "0.1.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" dependencies = [ "anyhow", "chrono", @@ -4696,7 +4919,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -4809,10 +5032,23 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.21.9" +name 
= "rustix" +version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.6", @@ -4822,9 +5058,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring 0.17.6", @@ -4938,6 +5174,7 @@ dependencies = [ "aws-smithy-async", "bincode", "bytes", + "camino", "chrono", "clap", "crc32c", @@ -4947,18 +5184,23 @@ dependencies = [ "hex", "histogram", "itertools", + "native-tls", "pageserver", "pageserver_api", + "postgres-native-tls", + "postgres_ffi", "rand 0.8.5", "remote_storage", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", "thiserror", "tokio", + "tokio-postgres", "tokio-rustls 0.25.0", "tokio-stream", + "tokio-util", "tracing", "tracing-appender", "tracing-subscriber", @@ -4988,7 +5230,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5000,7 +5242,7 @@ dependencies = [ "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "sd-notify", @@ -5130,13 +5372,13 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.31.6" +version = "0.32.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" +checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest", - "rustls 0.21.9", + "reqwest 0.12.4", + "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5149,9 +5391,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9" +checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e" dependencies = [ "backtrace", "once_cell", @@ -5161,11 +5403,11 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a" +checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname", + "hostname 0.4.0", "libc", "os_info", "rustc_version", @@ -5175,9 +5417,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" +checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826" dependencies = [ "once_cell", "rand 0.8.5", @@ -5188,9 +5430,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7" +checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5198,9 +5440,9 @@ dependencies = [ [[package]] 
name = "sentry-tracing" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3" +checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe" dependencies = [ "sentry-backtrace", "sentry-core", @@ -5210,13 +5452,13 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" +checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c" dependencies = [ "debugid", - "getrandom 0.2.11", "hex", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -5473,9 +5715,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" @@ -5567,7 +5809,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5598,7 +5840,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "itertools", "lasso", "measured", @@ -5608,10 +5850,12 @@ dependencies = [ "pageserver_client", "postgres_connection", "r2d2", - "reqwest", + "reqwest 0.12.4", "routerify", "serde", "serde_json", + "strum", + "strum_macros", "thiserror", "tokio", "tokio-util", @@ -5627,10 +5871,10 @@ dependencies = [ "anyhow", "clap", "comfy-table", - "hyper", + "hyper 0.14.26", "pageserver_api", "pageserver_client", - "reqwest", + "reqwest 0.12.4", "serde", "serde_json", "thiserror", @@ -5668,7 +5912,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -5684,8 +5928,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499" +source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8" [[package]] name = "syn" @@ -5714,6 +5957,9 @@ name = "sync_wrapper" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -6047,7 +6293,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", "ring 0.17.6", - "rustls 0.22.2", + "rustls 0.22.4", "tokio", "tokio-postgres", "tokio-rustls 0.25.0", @@ -6060,7 +6306,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.9", + "rustls 0.21.11", "tokio", ] @@ -6070,7 +6316,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" dependencies = [ - "rustls 0.22.2", + "rustls 0.22.4", "rustls-pki-types", "tokio", ] @@ -6110,7 +6356,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.20.1", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" +dependencies = [ + "futures-util", + "log", + 
"tokio", + "tungstenite 0.21.0", ] [[package]] @@ -6177,10 +6435,10 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", + "h2 0.3.26", "http 0.2.9", - "http-body", - "hyper", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", @@ -6320,12 +6578,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -6349,6 +6609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -6366,11 +6627,11 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest", + "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -6403,6 +6664,25 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror", + "url", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -6437,15 +6717,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" 
-dependencies = [ - "version_check", -] - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -6500,7 +6771,7 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls 0.21.9", + "rustls 0.21.11", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", @@ -6568,7 +6839,7 @@ dependencies = [ "hex", "hex-literal", "humantime", - "hyper", + "hyper 0.14.26", "jsonwebtoken", "leaky-bucket", "metrics", @@ -6727,9 +6998,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -6737,9 +7008,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -6752,9 +7023,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -6764,9 +7035,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", 
"wasm-bindgen-macro-support", @@ -6774,9 +7045,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -6787,9 +7058,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-streams" @@ -6804,6 +7075,19 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -6821,9 +7105,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -6844,6 +7128,15 @@ version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + 
"rustls-pki-types", +] + [[package]] name = "which" version = "4.4.0" @@ -6895,6 +7188,25 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -6928,6 +7240,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6958,6 +7279,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6970,6 +7306,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6982,6 +7324,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6994,6 +7342,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -7006,6 +7360,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -7018,6 +7378,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -7030,6 +7396,12 @@ version = "0.48.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -7042,6 +7414,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -7061,6 +7439,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" @@ -7090,11 +7478,10 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", - "hashbrown 0.13.2", "hashbrown 0.14.0", "hex", "hmac", - "hyper", + "hyper 0.14.26", "indexmap 1.9.3", "itertools", "libc", @@ -7111,8 +7498,9 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest", - "rustls 0.21.9", + "reqwest 0.11.19", + "reqwest 0.12.4", + "rustls 0.21.11", "scopeguard", "serde", "serde_json", @@ -7121,6 +7509,7 @@ dependencies = [ "subtle", "syn 1.0.109", "syn 2.0.52", + "sync_wrapper", "time", "time-macros", "tokio", @@ -7132,7 +7521,6 @@ dependencies = [ "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 3c6077648e..a6d406dc2f 100644 --- a/Cargo.toml 
+++ b/Cargo.toml @@ -44,10 +44,11 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.18" -azure_identity = "0.18" -azure_storage = "0.18" -azure_storage_blobs = "0.18" +atomic-take = "1.1.0" +azure_core = "0.19" +azure_identity = "0.19" +azure_storage = "0.19" +azure_storage_blobs = "0.19" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -97,7 +98,7 @@ http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +hyper-tungstenite = "0.13.0" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" @@ -106,7 +107,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.13", features=["default", "lasso"] } +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } @@ -128,10 +130,10 @@ prost = "0.11" rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] } -reqwest-middleware = "0.2.0" -reqwest-retry = "0.2.2" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-middleware = "0.3.0" +reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -141,7 +143,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" -sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.32", 
default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" @@ -155,7 +157,8 @@ socket2 = "0.5" strum = "0.24" strum_macros = "0.24" "subtle" = "2.5.0" -svg_fmt = "0.4.1" +# https://github.com/nical/rust_debug/pull/4 +svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" @@ -174,10 +177,11 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.20.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-opentelemetry = "0.21.0" +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } twox-hash = { version = "1.6.3", default-features = false } url = "2.2" urlencoding = "2.1" diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 1ed6f87473..19739cc1f8 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,8 +58,14 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM -ENV LLVM_VERSION=17 +ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb 
http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -135,7 +141,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.77.0 +ENV RUSTC_VERSION=1.78.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ diff --git a/Makefile b/Makefile index f13f080f1a..5e2b3c4367 100644 --- a/Makefile +++ b/Makefile @@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) - # macOS with brew-installed openssl requires explicit paths - # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) - PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib - PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig - # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure - # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage - EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + ifndef DISABLE_HOMEBREW + # macOS with brew-installed openssl requires explicit paths + # It can be configured with OPENSSL_PREFIX variable + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig + # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure + # brew formulae are keg-only and not symlinked into 
HOMEBREW_PREFIX, force their usage + EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + endif endif # Use -C option so that when PostgreSQL "make install" installs the diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 117919786e..67c5250376 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -47,7 +47,7 @@ use chrono::Utc; use clap::Arg; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; @@ -62,6 +62,7 @@ use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +use compute_tools::swap::resize_swap; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var @@ -110,6 +111,7 @@ fn main() -> Result<()> { .expect("Postgres connection string is required"); let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current @@ -226,14 +228,14 @@ fn main() -> Result<()> { // If this is a pooled VM, prewarm before starting HTTP server and becoming // available for binding. Prewarming helps Postgres start quicker later, - // because QEMU will already have it's memory allocated from the host, and + // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if !spec_set { compute.prewarm_postgres()?; } - // Launch http service first, so we were able to serve control-plane - // requests, while configuration is still in progress. 
+ // Launch http service first, so that we can serve control-plane requests + // while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); @@ -253,21 +255,22 @@ fn main() -> Result<()> { break; } } + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; } // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - - // Record for how long we slept waiting for the spec. - state.metrics.wait_for_spec_ms = Utc::now() - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - // Reset start time to the actual start of the configuration, so that - // total startup time was properly measured at the end. - state.start_time = Utc::now(); - state.status = ComputeStatus::Init; compute.state_changed.notify_all(); @@ -275,33 +278,72 @@ fn main() -> Result<()> { "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); + // before we release the mutex, fetch the swap size (if any) for later. 
+ let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; drop(state); // Launch remaining service threads let _monitor_handle = launch_monitor(&compute); let _configurator_handle = launch_configurator(&compute); - // Start Postgres + let mut prestartup_failed = false; let mut delay_exit = false; - let mut exit_code = None; - let pg = match compute.start_compute(extension_server_port) { - Ok(pg) => Some(pg), - Err(err) => { - error!("could not start the compute node: {:#}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. - compute.state_changed.notify_all(); - drop(state); // unlock - delay_exit = true; - None + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. + // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + match resize_swap(size_bytes) { + Ok(()) => { + let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_gib, "resized swap"); + } + Err(err) => { + let err = err.context("failed to resize swap"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. 
+ prestartup_failed = true; + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + compute.state_changed.notify_all(); + delay_exit = true; + } } - }; + } + + // Start Postgres + let mut pg = None; + let mut exit_code = None; + + if !prestartup_failed { + pg = match compute.start_compute(extension_server_port) { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:#}", err); + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{:?}", err)); + state.status = ComputeStatus::Failed; + // Notify others that Postgres failed to start. In case of configuring the + // empty compute, it's likely that API handler is still waiting for compute + // state change. With this we will notify it that compute is in Failed state, + // so control plane will know about it earlier and record proper error instead + // of timeout. + compute.state_changed.notify_all(); + drop(state); // unlock + delay_exit = true; + None + } + }; + } else { + warn!("skipping postgres startup because pre-startup step failed"); + } // Start the vm-monitor if directed to. The vm-monitor only runs on linux // because it requires cgroups. 
@@ -526,6 +568,11 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("resize-swap-on-bind") + .long("resize-swap-on-bind") + .action(clap::ArgAction::SetTrue), + ) } /// When compute_ctl is killed, send also termination signal to sync-safekeepers diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 88dc4aca2b..40060f4117 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -818,9 +818,15 @@ impl ComputeNode { Client::connect(zenith_admin_connstr.as_str(), NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; + + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + drop(client); // reconnect with connstring with expected name @@ -832,24 +838,29 @@ impl ComputeNode { }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. 
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, connstr.as_str(), &mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; handle_grants( spec, &mut client, connstr.as_str(), self.has_feature(ComputeFeature::AnonExtension), - )?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); @@ -857,7 +868,7 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { let mut client = Client::connect(connstr.as_str(), NoTls)?; - handle_migrations(&mut client) + handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index f1fd8637f5..89c866b20c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use 
crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -92,6 +92,27 @@ pub fn write_postgres_conf( } } + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). + .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; + } + } + // If there are any extra options in the 'settings' field, append those if spec.cluster.settings.is_some() { writeln!(file, "# Managed by compute_ctl: begin")?; diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 4e01ffd954..eac808385c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -14,4 +14,5 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 5deb50d6b7..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } diff --git 
a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 5643634633..3a6e18b638 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,7 +2,7 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; @@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - if existing_dbs.get(&op.name).is_some() { + if existing_dbs.contains_key(&op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), @@ -698,7 +698,8 @@ pub fn handle_grants( // it is important to run this after all grants if enable_anon_extension { - handle_extension_anon(spec, &db.owner, &mut db_client, false)?; + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; } } @@ -813,28 +814,36 @@ $$;"#, // Add new migrations below. 
]; - let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; + let mut func = || { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; - query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; - query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; - query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; - query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + Ok::<_, anyhow::Error>(()) + }; + func().context("handle_migrations prepare")?; - query = "SELECT id FROM neon_migration.migration_id"; - let row = client.query_one(query, &[])?; + let query = "SELECT id FROM neon_migration.migration_id"; + let row = client + .query_one(query, &[]) + .context("handle_migrations get migration_id")?; let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let starting_migration_id = current_migration; - query = "BEGIN"; - client.simple_query(query)?; + let query = "BEGIN"; + client + .simple_query(query) + .context("handle_migrations begin")?; while current_migration < migrations.len() { let migration = &migrations[current_migration]; @@ -842,7 +851,9 @@ $$;"#, info!("Skip migration id={}", current_migration); } else { info!("Running migration:\n{}\n", migration); - 
client.simple_query(migration)?; + client.simple_query(migration).with_context(|| { + format!("handle_migrations current_migration={}", current_migration) + })?; } current_migration += 1; } @@ -850,10 +861,14 @@ $$;"#, "UPDATE neon_migration.migration_id SET id={}", migrations.len() ); - client.simple_query(&setval)?; + client + .simple_query(&setval) + .context("handle_migrations update id")?; - query = "COMMIT"; - client.simple_query(query)?; + let query = "COMMIT"; + client + .simple_query(query) + .context("handle_migrations commit")?; info!( "Ran {} migrations", diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs new file mode 100644 index 0000000000..c22b6bc14e --- /dev/null +++ b/compute_tools/src/swap.rs @@ -0,0 +1,36 @@ +use anyhow::{anyhow, Context}; +use tracing::warn; + +pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; + +pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { + // run `/neonvm/bin/resize-swap --once {size_bytes}` + // + // Passing '--once' causes resize-swap to delete itself after successful completion, which + // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while + // postgres is running. + // + // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg. 
+ let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(RESIZE_SWAP_BIN) + .arg("--once") + .arg(size_bytes.to_string()) + .spawn(); + + if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + return Ok(()); + } + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => Err(anyhow!("process exited with {status}")), + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| { + format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`") + }) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index b544a8c587..2ce041068e 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,6 +17,7 @@ nix.workspace = true once_cell.workspace = true postgres.workspace = true hex.workspace = true +humantime-serde.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 56495dd2da..e01d5c9799 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -14,15 +14,15 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; +use pageserver_api::config::{ + DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, +}; use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; use 
pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; -use pageserver_api::{ - DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, -}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::{ @@ -417,6 +417,54 @@ async fn handle_tenant( println!("{} {:?}", t.id, t.state); } } + Some(("import", import_match)) => { + let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + + let storage_controller = StorageController::from_env(env); + let create_response = storage_controller.tenant_import(tenant_id).await?; + + let shard_zero = create_response + .shards + .first() + .expect("Import response omitted shards"); + + let attached_pageserver_id = shard_zero.node_id; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); + + println!( + "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" + ); + + let timelines = pageserver + .http_client + .list_timelines(shard_zero.shard_id) + .await?; + + // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names + let main_timeline = timelines + .iter() + .find(|t| t.ancestor_timeline_id.is_none()) + .expect("No timelines found") + .timeline_id; + + let mut branch_i = 0; + for timeline in timelines.iter() { + let branch_name = if timeline.timeline_id == main_timeline { + "main".to_string() + } else { + branch_i += 1; + format!("branch_{branch_i}") + }; + + println!( + "Importing timeline {tenant_id}/{} as branch {branch_name}", + timeline.timeline_id + ); + + env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; + } + } Some(("create", create_match)) => { let tenant_conf: HashMap<_, _> = create_match .get_many::("config") @@ -789,6 +837,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); + let allow_multiple = 
sub_args.get_flag("allow-multiple"); + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -806,7 +856,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -835,6 +887,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let remote_ext_config = sub_args.get_one::("remote-ext-config"); + let allow_multiple = sub_args.get_flag("allow-multiple"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -860,11 +914,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .cloned() .unwrap_or_default(); - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { let conf = env.get_pageserver_conf(pageserver_id).unwrap(); @@ -1231,7 +1287,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1396,6 +1452,12 @@ fn cli() -> Command { .help("If set, will create test user `user` and `neondb` database. 
Requires `update-catalog = true`") .required(false); + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.") + .long("allow-multiple") + .action(ArgAction::SetTrue) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1417,6 +1479,7 @@ fn cli() -> Command { .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) @@ -1479,6 +1542,8 @@ fn cli() -> Command { .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) + .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) + .about("Import a tenant that is present in remote storage, and create branches for its timelines")) ) .subcommand( Command::new("pageserver") @@ -1503,8 +1568,8 @@ fn cli() -> Command { Command::new("storage_controller") .arg_required_else_help(true) .about("Manage storage_controller") - .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(Command::new("stop").about("Stop local pageserver") + .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("stop").about("Stop storage controller") .arg(stop_mode_arg.clone())) ) .subcommand( @@ -1550,6 +1615,7 @@ fn cli() -> Command { .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") @@ -1558,6 +1624,7 @@ fn cli() -> Command { .arg(safekeepers_arg) .arg(remote_ext_config_args) .arg(create_test_user) + .arg(allow_multiple.clone()) ) 
.subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 03f7db99fb..20371e1cb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -554,6 +554,7 @@ impl Endpoint { format_version: 1.0, operation_uuid: None, features: self.features.clone(), + swap_size_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index bd3dbef453..6437d04ec8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -17,6 +17,7 @@ use std::net::Ipv4Addr; use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use std::time::Duration; use utils::{ auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -66,6 +67,10 @@ pub struct LocalEnv { pub broker: NeonBroker, + // Configuration for the storage controller (1 per neon_local environment) + #[serde(default)] + pub storage_controller: NeonStorageControllerConf, + /// This Vec must always contain at least one pageserver pub pageservers: Vec, @@ -98,6 +103,29 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } +/// Broker config for cluster internal communication. +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct NeonStorageControllerConf { + /// Heartbeat timeout before marking a node offline + #[serde(with = "humantime_serde")] + pub max_unavailable: Duration, +} + +impl NeonStorageControllerConf { + // Use a shorter pageserver unavailability interval than the default to speed up tests. 
+ const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = + std::time::Duration::from_secs(10); +} + +impl Default for NeonStorageControllerConf { + fn default() -> Self { + Self { + max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + } + } +} + // Dummy Default impl to satisfy Deserialize derive. impl Default for NeonBroker { fn default() -> Self { @@ -129,6 +157,8 @@ pub struct PageServerConf { pub(crate) virtual_file_io_engine: Option, pub(crate) get_vectored_impl: Option, + pub(crate) get_impl: Option, + pub(crate) validate_vectored_get: Option, } impl Default for PageServerConf { @@ -141,6 +171,8 @@ impl Default for PageServerConf { http_auth_type: AuthType::Trust, virtual_file_io_engine: None, get_vectored_impl: None, + get_impl: None, + validate_vectored_get: None, } } } @@ -156,6 +188,7 @@ pub struct SafekeeperConf { pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, + pub listen_addr: Option, } impl Default for SafekeeperConf { @@ -169,6 +202,7 @@ impl Default for SafekeeperConf { remote_storage: None, backup_threads: None, auth_enabled: false, + listen_addr: None, } } } @@ -348,7 +382,10 @@ impl LocalEnv { // Find neon binaries. if env.neon_distrib_dir == Path::new("") { - env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); + env::current_exe()? 
+ .parent() + .unwrap() + .clone_into(&mut env.neon_distrib_dir); } if env.pageservers.is_empty() { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index abf815f07a..1a64391306 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -92,6 +92,8 @@ impl PageServerNode { http_auth_type, virtual_file_io_engine, get_vectored_impl, + get_impl, + validate_vectored_get, } = &self.conf; let id = format!("id={}", id); @@ -111,6 +113,16 @@ impl PageServerNode { } else { String::new() }; + let get_impl = if let Some(get_impl) = get_impl { + format!("get_impl='{get_impl}'") + } else { + String::new() + }; + let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get { + format!("validate_vectored_get={validate_vectored_get}") + } else { + String::new() + }; let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -124,6 +136,8 @@ impl PageServerNode { broker_endpoint_param, virtual_file_io_engine, get_vectored_impl, + get_impl, + validate_vectored_get, ]; if let Some(control_plane_api) = &self.env.control_plane_api { @@ -234,12 +248,13 @@ impl PageServerNode { // situation: the metadata is written by some other script. 
std::fs::write( metadata_path, - serde_json::to_vec(&serde_json::json!({ - "host": "localhost", - "port": self.pg_connection_config.port(), - "http_host": "localhost", - "http_port": http_port, - })) + serde_json::to_vec(&pageserver_api::config::NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: self.pg_connection_config.port(), + http_host: "localhost".to_string(), + http_port, + other: HashMap::new(), + }) .unwrap(), ) .expect("Failed to write metadata file"); @@ -434,6 +449,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -552,6 +572,11 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, + switch_to_aux_file_v2: settings + .remove("switch_to_aux_file_v2") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_to_aux_file_v2' as bool")?, } }; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 6ac71dfe51..d62a2e80b5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -70,24 +70,31 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: reqwest::Client, + pub listen_addr: String, pub http_base_url: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { + let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { + listen_addr.clone() + } else { + "127.0.0.1".to_string() + }; SafekeeperNode { id: conf.id, conf: conf.clone(), - pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), + pg_connection_config: 
Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: reqwest::Client::new(), - http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), + http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. - fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { - PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port) + fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { + PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { @@ -111,8 +118,8 @@ impl SafekeeperNode { ); io::stdout().flush().unwrap(); - let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); - let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); + let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); @@ -139,7 +146,7 @@ impl SafekeeperNode { availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { - let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port); + let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); } if !self.conf.sync { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 7f2b973391..f1c43f4036 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -1,6 +1,8 @@ -use crate::{background_process, local_env::LocalEnv}; +use crate::{ + background_process, + local_env::{LocalEnv, NeonStorageControllerConf}, +}; use camino::{Utf8Path, Utf8PathBuf}; -use hyper::Method; use pageserver_api::{ 
controller_api::{ NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, @@ -14,6 +16,7 @@ use pageserver_api::{ }; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; +use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use std::{fs, str::FromStr}; use tokio::process::Command; @@ -32,15 +35,13 @@ pub struct StorageController { public_key: Option, postgres_port: u16, client: reqwest::Client, + config: NeonStorageControllerConf, } const COMMAND: &str = "storage_controller"; const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; -// Use a shorter pageserver unavailability interval than the default to speed up tests. -const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); - #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -135,6 +136,7 @@ impl StorageController { client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), + config: env.storage_controller.clone(), } } @@ -272,8 +274,6 @@ impl StorageController { // Run migrations on every startup, in case something changed. 
let database_url = self.setup_database().await?; - let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into(); - let mut args = vec![ "-l", &self.listen, @@ -283,7 +283,7 @@ impl StorageController { "--database-url", &database_url, "--max-unavailable-interval", - &max_unavailable.to_string(), + &humantime::Duration::from(self.config.max_unavailable).to_string(), ] .into_iter() .map(|s| s.to_string()) @@ -379,7 +379,7 @@ impl StorageController { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: reqwest::Method, path: String, body: Option, ) -> anyhow::Result @@ -472,6 +472,16 @@ impl StorageController { .await } + #[instrument(skip(self))] + pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), TenantCreateResponse>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/import"), + None, + ) + .await + } + #[instrument(skip(self))] pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { self.dispatch::<(), _>( diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 2edd09eac1..c19bc96cdb 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,20 +1,19 @@ -use std::{collections::HashMap, str::FromStr}; +use std::{collections::HashMap, str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; -use hyper::Method; use pageserver_api::{ controller_api::{ NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantDescribeResponse, TenantPolicyRequest, }, models::{ - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, - TenantShardSplitRequest, TenantShardSplitResponse, + LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest, + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; use 
pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; -use reqwest::Url; +use reqwest::{Method, StatusCode, Url}; use serde::{de::DeserializeOwned, Serialize}; use utils::id::{NodeId, TenantId}; @@ -120,6 +119,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary + /// mode so that it can warm up content on a pageserver. + TenantWarmup { + #[arg(long)] + tenant_id: TenantId, + }, } #[derive(Parser)] @@ -226,7 +231,7 @@ impl Client { /// Simple HTTP request wrapper for calling into storage controller async fn dispatch( &self, - method: hyper::Method, + method: Method, path: String, body: Option, ) -> mgmt_api::Result @@ -581,6 +586,94 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantWarmup { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await; + match describe_response { + Ok(describe) => { + if matches!(describe.policy, PlacementPolicy::Secondary) { + // Fine: it's already known to controller in secondary mode: calling + // again to put it into secondary mode won't cause problems. + } else { + anyhow::bail!("Tenant already present with policy {:?}", describe.policy); + } + } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { + // Fine: this tenant isn't know to the storage controller yet. 
+ } + Err(e) => { + // Unexpected API error + return Err(e.into()); + } + } + + vps_client + .location_config( + TenantShardId::unsharded(tenant_id), + pageserver_api::models::LocationConfig { + mode: pageserver_api::models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: 0, + shard_count: 0, + shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, + tenant_conf: TenantConfig::default(), + }, + None, + true, + ) + .await?; + + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + let secondary_ps_id = describe_response + .shards + .first() + .unwrap() + .node_secondary + .first() + .unwrap(); + + println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); + loop { + let (status, progress) = vps_client + .tenant_secondary_download( + TenantShardId::unsharded(tenant_id), + Some(Duration::from_secs(10)), + ) + .await?; + println!( + "Progress: {}/{} layers, {}/{} bytes", + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + match status { + StatusCode::OK => { + println!("Download complete"); + break; + } + StatusCode::ACCEPTED => { + // Loop + } + _ => { + anyhow::bail!("Unexpected download status: {status}"); + } + } + } + } } Ok(()) diff --git a/docs/storage_controller.md b/docs/storage_controller.md new file mode 100644 index 0000000000..daf4d0c8b7 --- /dev/null +++ b/docs/storage_controller.md @@ -0,0 +1,150 @@ +# Storage Controller + +## Concepts + +The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller, +which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations). 
+ +It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding +the underlying details of how data is spread across multiple nodes. + +The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent. + +## APIs + +The storage controller’s HTTP server implements four logically separate APIs: + +- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver. +- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits. +- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system. +- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers + to ensure data safety with generation numbers. + +The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs). + +See the `http.rs` file in the source for where the HTTP APIs are implemented. + +## Database + +The storage controller uses a postgres database to persist a subset of its state.
Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not +persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and +rebuilt on startup. + +The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. + +The `diesel` crate is used for defining models & migrations. + +Running a local cluster with `cargo neon` automatically starts a vanilla postgress process to host the storage controller’s database. + +### Diesel tip: migrations + +If you need to modify the database schema, here’s how to create a migration: + +- Install the diesel CLI with `cargo install diesel_cli` +- Use `diesel migration generate ` to create a new migration +- Populate the SQL files in the `migrations/` subdirectory +- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically. + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` +- Commit the migration files and the changes to schema.rs +- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. +- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. 
+ +## storcon_cli + +The `storcon_cli` tool enables interactive management of the storage controller. This is usually +only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline). + +`storcon_cli --help` includes details on commands. + +# Deploying + +This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as +part of a self-hosted system. + +_General note: since the default `neon_local` environment includes a storage controller, this is a useful +reference when figuring out deployment._ + +## Database + +It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral +local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver. + +The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte. + +Set the URL to the database using the `--database-url` CLI option. + +There is no need to run migrations manually: the storage controller automatically applies migrations +when it starts up. + +## Configure pageservers to use the storage controller + +1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should + point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters. +2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself + with the storage controller when it starts up. See the example below for the format of this file. 
+ +### Example `metadata.json` + +``` +{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000} +``` + +- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever + postgres runs. +- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where + the storage controller runs. + +## Handle compute notifications. + +The storage controller independently moves tenant attachments between pageservers in response to +changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable +postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver +location changes. + +The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires +JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. + +In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems +the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling +the compute hook. + +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: +the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. + +``` +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option, + shards: Vec, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. 
The + shards identified by `NodeId` must be converted to the address+port of the node. + - if stripe_size is not None, set `neon.stripe_size` to this value + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds.. + +### Example notification body + +``` +{ + "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", + "stripe_size": 32768, + "shards": [ + {"node_id": 344, "shard_number": 0}, + {"node_id": 722, "shard_number": 1}, + ], +} +``` diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 71ae66c45c..1c4ee2089f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -33,6 +33,23 @@ pub struct ComputeSpec { #[serde(default)] pub features: Vec, + /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs + /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first + /// received. + /// + /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could + /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus + /// giving every VM much more swap than it should have (32GiB). + /// + /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for + /// enabling the swap resizing behavior once rollout is complete. + /// + /// See neondatabase/cloud#12047 for more. + #[serde(default)] + pub swap_size_bytes: Option, + /// Expected cluster state at the end of transition process. 
pub cluster: Cluster, pub delta_operations: Option>, diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index f6a49a0166..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -10,11 +10,13 @@ libc.workspace = true once_cell.workspace = true chrono.workspace = true twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true +measured-process.workspace = true [dev-dependencies] rand = "0.8" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index dfb4461ce9..f53511ab5c 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -7,14 +7,19 @@ //! use significantly less memory than this, but can only approximate the cardinality. use std::{ - collections::HashMap, - hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}, - sync::{atomic::AtomicU8, Arc, RwLock}, + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, }; -use prometheus::{ - core::{self, Describer}, - proto, Opts, +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, }; use twox_hash::xxh3; @@ -93,203 +98,25 @@ macro_rules! 
register_hll { /// ``` /// /// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLogVec { - core: Arc>, +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], } - -struct HyperLogLogVecCore { - pub children: RwLock, BuildHasherDefault>>, - pub desc: core::Desc, - pub opts: Opts, -} - -impl core::Collector for HyperLogLogVec { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] - } - - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - for child in self.core.children.read().unwrap().values() { - child.core.collect_into(&mut metrics); - } - m.set_metric(metrics); - - vec![m] +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } } } -impl HyperLogLogVec { - /// Create a new [`HyperLogLogVec`] based on the provided - /// [`Opts`] and partitioned by the given label names. At least one label name must be - /// provided. - pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result { - assert!(N.is_power_of_two()); - let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect(); - let opts = opts.variable_labels(variable_names); - - let desc = opts.describe()?; - let v = HyperLogLogVecCore { - children: RwLock::new(HashMap::default()), - desc, - opts, - }; - - Ok(Self { core: Arc::new(v) }) - } - - /// `get_metric_with_label_values` returns the [`HyperLogLog

`] for the given slice - /// of label values (same order as the VariableLabels in Desc). If that combination of - /// label values is accessed for the first time, a new [`HyperLogLog

`] is created. - /// - /// An error is returned if the number of label values is not the same as the - /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - self.core.get_metric_with_label_values(vals) - } - - /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error - /// occurs. - pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog { - self.get_metric_with_label_values(vals).unwrap() - } +impl MetricType for HyperLogLogState { + type Metadata = (); } -impl HyperLogLogVecCore { - pub fn get_metric_with_label_values( - &self, - vals: &[&str], - ) -> prometheus::Result> { - let h = self.hash_label_values(vals)?; - - if let Some(metric) = self.children.read().unwrap().get(&h).cloned() { - return Ok(metric); - } - - self.get_or_create_metric(h, vals) - } - - pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result { - if vals.len() != self.desc.variable_labels.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: self.desc.variable_labels.len(), - got: vals.len(), - }); - } - - let mut h = xxh3::Hash64::default(); - for val in vals { - h.write(val.as_bytes()); - } - - Ok(h.finish()) - } - - fn get_or_create_metric( - &self, - hash: u64, - label_values: &[&str], - ) -> prometheus::Result> { - let mut children = self.children.write().unwrap(); - // Check exist first. - if let Some(metric) = children.get(&hash).cloned() { - return Ok(metric); - } - - let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?; - children.insert(hash, metric.clone()); - Ok(metric) - } -} - -/// HLL is a probabilistic cardinality measure. -/// -/// How to use this time-series for a metric name `my_metrics_total_hll`: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) 
-/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// If you want an estimate over time, you can use the following query: -/// -/// ```promql -/// # harmonic mean -/// 1 / ( -/// sum ( -/// 2 ^ -( -/// # HLL merge operation -/// max ( -/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) -/// ) by (hll_shard, other_labels...) -/// ) -/// ) without (hll_shard) -/// ) -/// * alpha -/// * shards_count -/// * shards_count -/// ``` -/// -/// In the case of low cardinality, you might want to use the linear counting approximation: -/// -/// ```promql -/// # LinearCounting(m, V) = m log (m / V) -/// shards_count * ln(shards_count / -/// # calculate V = how many shards contain a 0 -/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) -/// ) -/// ``` -/// -/// See for estimates on alpha -#[derive(Clone)] -pub struct HyperLogLog { - core: Arc>, -} - -impl HyperLogLog { - /// Create a [`HyperLogLog`] with the `name` and `help` arguments. - pub fn new, S2: Into>(name: S1, help: S2) -> prometheus::Result { - assert!(N.is_power_of_two()); - let opts = Opts::new(name, help); - Self::with_opts(opts) - } - - /// Create a [`HyperLogLog`] with the `opts` options. - pub fn with_opts(opts: Opts) -> prometheus::Result { - Self::with_opts_and_label_values(&opts, &[]) - } - - fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result { - let desc = opts.describe()?; - let labels = make_label_pairs(&desc, label_values)?; - - let v = HyperLogLogCore { - shards: [0; N].map(AtomicU8::new), - desc, - labels, - }; - Ok(Self { core: Arc::new(v) }) - } - +impl HyperLogLogState { pub fn measure(&self, item: &impl Hash) { // changing the hasher will break compatibility with previous measurements. 
self.record(BuildHasherDefault::::default().hash_one(item)); @@ -299,42 +126,11 @@ impl HyperLogLog { let p = N.ilog2() as u8; let j = hash & (N as u64 - 1); let rho = (hash >> p).leading_zeros() as u8 + 1 - p; - self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); - } -} - -struct HyperLogLogCore { - shards: [AtomicU8; N], - desc: core::Desc, - labels: Vec, -} - -impl core::Collector for HyperLogLog { - fn desc(&self) -> Vec<&core::Desc> { - vec![&self.core.desc] + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); } - fn collect(&self) -> Vec { - let mut m = proto::MetricFamily::default(); - m.set_name(self.core.desc.fq_name.clone()); - m.set_help(self.core.desc.help.clone()); - m.set_field_type(proto::MetricType::GAUGE); - - let mut metrics = Vec::new(); - self.core.collect_into(&mut metrics); - m.set_metric(metrics); - - vec![m] - } -} - -impl HyperLogLogCore { - fn collect_into(&self, metrics: &mut Vec) { - self.shards.iter().enumerate().for_each(|(i, x)| { - let mut shard_label = proto::LabelPair::default(); - shard_label.set_name("hll_shard".to_owned()); - shard_label.set_value(format!("{i}")); - + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // This seems like it would be a race condition, @@ -344,85 +140,90 @@ impl HyperLogLogCore { // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // this would mean that a dev port-forwarding the metrics url won't break the sampling. 
- let v = x.swap(0, std::sync::atomic::Ordering::Relaxed); - - let mut m = proto::Metric::default(); - let mut c = proto::Gauge::default(); - c.set_value(v as f64); - m.set_gauge(c); - - let mut labels = Vec::with_capacity(self.labels.len() + 1); - labels.extend_from_slice(&self.labels); - labels.push(shard_label); - - m.set_label(labels); - metrics.push(m); + x.swap(0, std::sync::atomic::Ordering::Relaxed) }) } } - -fn make_label_pairs( - desc: &core::Desc, - label_values: &[&str], -) -> prometheus::Result> { - if desc.variable_labels.len() != label_values.len() { - return Err(prometheus::Error::InconsistentCardinality { - expect: desc.variable_labels.len(), - got: label_values.len(), - }); +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } - let total_len = desc.variable_labels.len() + desc.const_label_pairs.len(); - if total_len == 0 { - return Ok(vec![]); - } + struct HllShardLabel { + hll_shard: i64, + } - if desc.variable_labels.is_empty() { - return Ok(desc.const_label_pairs.clone()); - } + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } - let mut label_pairs = Vec::with_capacity(total_len); - for (i, n) in desc.variable_labels.iter().enumerate() { - let mut label_pair = proto::LabelPair::default(); - label_pair.set_name(n.clone()); - label_pair.set_value(label_values[i].to_owned()); - label_pairs.push(label_pair); + self.take_sample() + .into_iter() + 
.enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) } - - for label_pair in &desc.const_label_pairs { - label_pairs.push(label_pair.clone()); - } - label_pairs.sort(); - Ok(label_pairs) } #[cfg(test)] mod tests { use std::collections::HashSet; - use prometheus::{proto, Opts}; + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use rand::{rngs::StdRng, Rng, SeedableRng}; use rand_distr::{Distribution, Zipf}; use crate::HyperLogLogVec; - fn collect(hll: &HyperLogLogVec<32>) -> Vec { - let mut metrics = vec![]; - hll.core - .children - .read() - .unwrap() - .values() - .for_each(|c| c.core.collect_into(&mut metrics)); - metrics + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, } - fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 { + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + // cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. 
+ ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { let mut buckets = [0.0; 32]; - for metric in metrics.chunks_exact(32) { - if filter(&metric[0]) { - for (i, m) in metric.iter().enumerate() { - buckets[i] = f64::max(buckets[i], m.get_gauge().get_value()); - } + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); } } @@ -437,7 +238,7 @@ mod tests { } fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { - let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap(); + let hll = HyperLogLogVec::, 32>::new(); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut set_a = HashSet::new(); @@ -445,18 +246,20 @@ mod tests { for x in iter.by_ref().take(n) { set_a.insert(x.to_bits()); - hll.with_label_values(&["a"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); } for x in iter.by_ref().take(n) { set_b.insert(x.to_bits()); - hll.with_label_values(&["b"]).measure(&x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); } let merge = &set_a | &set_b; - let metrics = collect(&hll); - let len = get_cardinality(&metrics, |_| true); - let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a"); - let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b"); + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) } diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 22b0a18933..8e0dbe6ce4 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. 
#![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; mod hll; -pub use hll::{HyperLogLog, HyperLogLogVec}; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; #[cfg(target_os = "linux")] pub mod more_process_metrics; @@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
-pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + 
+#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. 
@@ -117,14 +250,22 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); - MAXRSS_KB.set(rusage_stats.ru_maxrss); + + // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669 + #[cfg(target_os = "macos")] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024); + } + #[cfg(not(target_os = "macos"))] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss); + } } fn get_rusage_stats() -> libc::rusage { @@ -151,6 +292,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -188,7 +330,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -201,7 +346,7 @@ impl GenericCounterPairVec

{ self.get_metric_with_label_values(vals).unwrap() } - pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) { + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { res[0] = self.inc.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals); } @@ -285,3 +430,171 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, 
A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn 
write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs new file mode 100644 index 0000000000..d996a62349 --- /dev/null +++ b/libs/pageserver_api/src/config.rs @@ -0,0 +1,31 @@ +use std::collections::HashMap; + +use const_format::formatcp; + +#[cfg(test)] +mod tests; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not neeed by the pageserver +// itself, it is only used for registering the pageserver with the control +// plane and/or storage controller. +// +#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] +pub struct NodeMetadata { + #[serde(rename = "host")] + pub postgres_host: String, + #[serde(rename = "port")] + pub postgres_port: u16, + pub http_host: String, + pub http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names fields that require. 
+ #[serde(flatten)] + pub other: HashMap, +} diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs new file mode 100644 index 0000000000..edeefc156e --- /dev/null +++ b/libs/pageserver_api/src/config/tests.rs @@ -0,0 +1,22 @@ +use super::*; + +#[test] +fn test_node_metadata_v1_backward_compatibilty() { + let v1 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + })); + + assert_eq!( + serde_json::from_slice::(&v1.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + other: HashMap::new(), + } + ) +} diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 852670af2c..2511de00d5 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,5 +1,6 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -21,15 +22,107 @@ pub struct Key { pub field6: u32, } +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; + +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; +pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; + +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x61; + +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x62; + +/// Check if the key falls in the range of metadata keys. 
+pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. + pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + Key { + field1: key[0], + field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, + field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), + field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), + field5: key[11], + field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), + } + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Extract a metadata key to a writer. The result should always be 16 bytes. + pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { + writer.put_u8(self.field1); + assert!(self.field2 <= 0xFFFF); + writer.put_u16(self.field2 as u16); + writer.put_u32(self.field3); + writer.put_u32(self.field4); + writer.put_u8(self.field5); + writer.put_u32(self.field6); + } + + /// Get the range of metadata keys. + pub const fn metadata_key_range() -> Range { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: METADATA_KEY_END_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// Get the range of aux keys. 
+ pub fn metadata_aux_key_range() -> Range { + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: AUX_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); - (((self.field1 & 0xf) as i128) << 120) + (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) | ((self.field4 as i128) << 40) @@ -39,7 +132,7 @@ impl Key { pub const fn from_i128(x: i128) -> Self { Key { - field1: ((x >> 120) & 0xf) as u8, + field1: ((x >> 120) & 0x7F) as u8, field2: ((x >> 104) & 0xFFFF) as u32, field3: (x >> 72) as u32, field4: (x >> 40) as u32, @@ -48,11 +141,11 @@ impl Key { } } - pub fn next(&self) -> Key { + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -81,6 +174,8 @@ impl Key { key } + /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::from_metadata_key`] instead. pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -92,6 +187,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::extract_metadata_key_to_writer`] instead. 
pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -475,12 +572,17 @@ pub const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +/// Non inherited range for vectored get. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. +pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); + // AUX_FILES currently stores only data for logical replication (slots etc), and // we don't preserve these on a branch because safekeepers can't follow timeline // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY + !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key) } #[inline(always)] @@ -556,11 +658,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -576,4 +681,16 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let mut output_key = Vec::new(); + encoded_key.extract_metadata_key_to_writer(&mut output_key); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 05fa4562e1..a9ad3aca18 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ 
b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,10 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::key::Key; +use crate::{ + key::Key, + shard::{ShardCount, ShardIdentity}, +}; use itertools::Itertools; /// @@ -14,44 +17,279 @@ pub struct KeySpace { pub ranges: Vec>, } -impl KeySpace { +/// A wrapper type for sparse keyspaces. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SparseKeySpace(pub KeySpace); + +/// Represents a contiguous half-open range of the keyspace, masked according to a particular +/// ShardNumber's stripes: within this range of keys, only some "belong" to the current +/// shard. +/// +/// When we iterate over keys within this object, we will skip any keys that don't belong +/// to this shard. +/// +/// The start + end keys may not belong to the shard: these specify where layer files should +/// start + end, but we will never actually read/write those keys. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardedRange<'a> { + pub shard_identity: &'a ShardIdentity, + pub range: Range, +} + +// Calculate the size of a range within the blocks of the same relation, or spanning only the +// top page in the previous relation's space. +fn contiguous_range_len(range: &Range) -> u32 { + debug_assert!(is_contiguous_range(range)); + if range.start.field6 == 0xffffffff { + range.end.field6 + 1 + } else { + range.end.field6 - range.start.field6 + } +} + +/// Return true if this key range includes only keys in the same relation's data blocks, or +/// just spanning one relation and the logical size (0xffffffff) block of the relation before it. +/// +/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not +/// be on our shard. Later in ShardedRange we do the extra work to figure out how much +/// of a given contiguous range is present on one shard. +/// +/// This matters, because: +/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. 
+/// - Within such ranges, we may calculate distances using simple subtraction of field6. +fn is_contiguous_range(range: &Range) -> bool { + range.start.field1 == range.end.field1 + && range.start.field2 == range.end.field2 + && range.start.field3 == range.end.field3 + && range.start.field4 == range.end.field4 + && (range.start.field5 == range.end.field5 + || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5)) +} + +impl<'a> ShardedRange<'a> { + pub fn new(range: Range, shard_identity: &'a ShardIdentity) -> Self { + Self { + shard_identity, + range, + } + } + + /// Break up this range into chunks, each of which has at least one local key in it if the + /// total range has at least one local key. + pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range)> { + // Optimization for single-key case (e.g. logical size keys) + if self.range.end == self.range.start.add(1) { + return vec![( + if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }, + self.range, + )]; + } + + if !is_contiguous_range(&self.range) { + // Ranges that span relations are not fragmented. We only get these ranges as a result + // of operations that act on existing layers, so we trust that the existing range is + // reasonably small. + return vec![(u32::MAX, self.range)]; + } + + let mut fragments: Vec<(u32, Range)> = Vec::new(); + + let mut cursor = self.range.start; + while cursor < self.range.end { + let advance_by = self.distance_to_next_boundary(cursor); + let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor); + + // If the previous fragment is undersized, then we seek to consume enough + // blocks to complete it. + let (want_blocks, merge_last_fragment) = match fragments.last_mut() { + Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)), + Some(frag) => { + // Prev block is complete, want the full number. 
+ ( + target_nblocks, + if is_fragment_disposable { + // If this current range will be empty (not shard-local data), we will merge into previous + Some(frag) + } else { + None + }, + ) + } + None => { + // First iteration, want the full number + (target_nblocks, None) + } + }; + + let advance_by = if is_fragment_disposable { + advance_by + } else { + std::cmp::min(advance_by, want_blocks) + }; + + let next_cursor = cursor.add(advance_by); + + let this_frag = ( + if is_fragment_disposable { + 0 + } else { + advance_by + }, + cursor..next_cursor, + ); + cursor = next_cursor; + + if let Some(last_fragment) = merge_last_fragment { + // Previous fragment was short or this one is empty, merge into it + last_fragment.0 += this_frag.0; + last_fragment.1.end = this_frag.1.end; + } else { + fragments.push(this_frag); + } + } + + fragments + } + + /// Estimate the physical pages that are within this range, on this shard. This returns + /// u32::MAX if the range spans relations: this return value should be interpreted as "large". + pub fn page_count(&self) -> u32 { + // Special cases for single keys like logical sizes + if self.range.end == self.range.start.add(1) { + return if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }; + } + + // We can only do an authentic calculation of contiguous key ranges + if !is_contiguous_range(&self.range) { + return u32::MAX; + } + + // Special case for single sharded tenants: our logical and physical sizes are the same + if self.shard_identity.count < ShardCount::new(2) { + return contiguous_range_len(&self.range); + } + + // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs + // to Self, and add the stripe's block count to our total if so. 
+ let mut result: u64 = 0; + let mut cursor = self.range.start; + while cursor < self.range.end { + // Count up to the next stripe_size boundary or end of range + let advance_by = self.distance_to_next_boundary(cursor); + + // If this blocks in this stripe belong to us, add them to our count + if !self.shard_identity.is_key_disposable(&cursor) { + result += advance_by as u64; + } + + cursor = cursor.add(advance_by); + } + + if result > u32::MAX as u64 { + u32::MAX + } else { + result as u32 + } + } + + /// Advance the cursor to the next potential fragment boundary: this is either + /// a stripe boundary, or the end of the range. + fn distance_to_next_boundary(&self, cursor: Key) -> u32 { + let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end)); + + if self.shard_identity.count < ShardCount::new(2) { + // Optimization: don't bother stepping through stripes if the tenant isn't sharded. + return distance_to_range_end; + } + + if cursor.field6 == 0xffffffff { + // We are wrapping from one relation's logical size to the next relation's first data block + return 1; + } + + let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0; + let stripe_remainder = self.shard_identity.stripe_size.0 + - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0); + + if cfg!(debug_assertions) { + // We should never overflow field5 and field6 -- our callers check this earlier + // and would have returned their u32::MAX cases if the input range violated this. 
+ let next_cursor = cursor.add(stripe_remainder); + debug_assert!( + next_cursor.field1 == cursor.field1 + && next_cursor.field2 == cursor.field2 + && next_cursor.field3 == cursor.field3 + && next_cursor.field4 == cursor.field4 + && next_cursor.field5 == cursor.field5 + ) + } + + std::cmp::min(stripe_remainder, distance_to_range_end) + } + + /// Whereas `page_count` estimates the number of pages physically in this range on this shard, + /// this function simply calculates the number of pages in the space, without accounting for those + /// pages that would not actually be stored on this node. /// + /// Don't use this function in code that works with physical entities like layer files. + fn raw_size(range: &Range) -> u32 { + if is_contiguous_range(range) { + contiguous_range_len(range) + } else { + u32::MAX + } + } +} + +impl KeySpace { + /// Create a key space with a single range. + pub fn single(key_range: Range) -> Self { + Self { + ranges: vec![key_range], + } + } + /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, target_size: u64) -> KeyPartitioning { + pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. - let target_nblocks = (target_size / BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - // If appending the next contiguous range in the keyspace to the current - // partition would cause it to be too large, start a new partition. 
- let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - parts.push(KeySpace { - ranges: current_part, - }); - current_part = Vec::new(); - current_part_size = 0; - } + // While doing partitioning, wrap the range in ShardedRange so that our size calculations + // will respect shard striping rather than assuming all keys within a range are present. + let range = ShardedRange::new(range.clone(), shard_identity); - // If the next range is larger than 'target_size', split it into - // 'target_size' chunks. - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - parts.push(KeySpace { - ranges: vec![start..next], - }); - start = next; - remain_size -= target_nblocks + // Chunk up the range into parts that each contain up to target_size local blocks + for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, and our current partition + // covers at least one block that is physically present in this shard, + // then start a new partition + if current_part_size + frag_on_shard_size as usize > target_nblocks as usize + && current_part_size > 0 + { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + current_part.push(frag_range.start..frag_range.end); + current_part_size += frag_on_shard_size as usize; } - current_part.push(start..range.end); - current_part_size += remain_size; } // add last partition that wasn't full yet. @@ -64,6 +302,10 @@ impl KeySpace { KeyPartitioning { parts } } + pub fn is_empty(&self) -> bool { + self.total_raw_size() == 0 + } + /// Merge another keyspace into the current one. 
/// Note: the keyspaces must not ovelap (enforced via assertions) pub fn merge(&mut self, other: &KeySpace) { @@ -94,12 +336,13 @@ impl KeySpace { /// Remove all keys in `other` from `self`. /// This can involve splitting or removing of existing ranges. - pub fn remove_overlapping_with(&mut self, other: &KeySpace) { + /// Returns the removed keyspace + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { let (self_start, self_end) = match (self.start(), self.end()) { (Some(start), Some(end)) => (start, end), _ => { // self is empty - return; + return KeySpace::default(); } }; @@ -112,30 +355,37 @@ impl KeySpace { .skip_while(|range| self_start >= range.end) .take_while(|range| self_end > range.start); + let mut removed_accum = KeySpaceRandomAccum::new(); for range in other_ranges { while let Some(overlap_at) = self.overlaps_at(range) { let overlapped = self.ranges[overlap_at].clone(); if overlapped.start < range.start && overlapped.end <= range.end { // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); self.ranges[overlap_at].end = range.start; } if overlapped.start >= range.start && overlapped.end > range.end { // Lower part of the range is completely overlapped. + removed_accum.add_range(self.ranges[overlap_at].start..range.end); self.ranges[overlap_at].start = range.end; } if overlapped.start < range.start && overlapped.end > range.end { // Middle part of the range is overlapped. 
+ removed_accum.add_range(range.clone()); self.ranges[overlap_at].end = range.start; self.ranges .insert(overlap_at + 1, range.end..overlapped.end); } if overlapped.start >= range.start && overlapped.end <= range.end { // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); self.ranges.remove(overlap_at); } } } + + removed_accum.to_keyspace() } pub fn start(&self) -> Option { @@ -146,11 +396,11 @@ impl KeySpace { self.ranges.last().map(|range| range.end) } - #[allow(unused)] - pub fn total_size(&self) -> usize { + /// The size of the keyspace in pages, before accounting for sharding + pub fn total_raw_size(&self) -> usize { self.ranges .iter() - .map(|range| key_range_size(range) as usize) + .map(|range| ShardedRange::raw_size(range) as usize) .sum() } @@ -170,6 +420,11 @@ impl KeySpace { pub fn overlaps(&self, range: &Range) -> bool { self.overlaps_at(range).is_some() } + + /// Check if the keyspace contains a key + pub fn contains(&self, key: &Key) -> bool { + self.overlaps(&(*key..key.next())) + } } /// @@ -184,10 +439,33 @@ pub struct KeyPartitioning { pub parts: Vec, } +/// Represents a partitioning of the sparse key space. +#[derive(Clone, Debug, Default)] +pub struct SparseKeyPartitioning { + pub parts: Vec, +} + impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { parts: Vec::new() } } + + /// Convert a key partitioning to a sparse partition. + pub fn into_sparse(self) -> SparseKeyPartitioning { + SparseKeyPartitioning { + parts: self.parts.into_iter().map(SparseKeySpace).collect(), + } + } +} + +impl SparseKeyPartitioning { + /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will + /// cause long/dead loops. 
+ pub fn into_dense(self) -> KeyPartitioning { + KeyPartitioning { + parts: self.parts.into_iter().map(|x| x.0).collect(), + } + } } /// @@ -219,7 +497,7 @@ impl KeySpaceAccum { #[inline(always)] pub fn add_range(&mut self, range: Range) { - self.size += key_range_size(&range) as u64; + self.size += ShardedRange::raw_size(&range) as u64; match self.accum.as_mut() { Some(accum) => { @@ -251,7 +529,9 @@ impl KeySpaceAccum { std::mem::take(self).to_keyspace() } - pub fn size(&self) -> u64 { + // The total number of keys in this object, ignoring any sharding effects that might cause some of + // the keys to be omitted in storage on this shard. + pub fn raw_size(&self) -> u64 { self.size } } @@ -307,36 +587,19 @@ impl KeySpaceRandomAccum { } } -#[inline(always)] -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; - - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 - } -} - pub fn singleton_range(key: Key) -> Range { key..key.next() } #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + models::ShardParameters, + shard::{ShardCount, ShardNumber}, + }; + use super::*; use std::fmt::Write; @@ -379,14 +642,17 @@ mod tests { accum.add_range(range.clone()); } - let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum(); - assert_eq!(accum.size(), expected_size); + let expected_size: u64 = ranges + .iter() + .map(|r| ShardedRange::raw_size(r) as u64) + .sum(); + assert_eq!(accum.raw_size(), expected_size); assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); 
assert_ks_eq(&accum.consume_keyspace(), vec![]); - assert_eq!(accum.size(), 0); + assert_eq!(accum.raw_size(), 0); for range in &ranges { accum.add_range(range.clone()); @@ -553,7 +819,16 @@ mod tests { Key::from_i128(11)..Key::from_i128(13), ], }; - key_space1.remove_overlapping_with(&key_space2); + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -583,7 +858,17 @@ mod tests { Key::from_i128(14)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -610,7 +895,11 @@ mod tests { Key::from_i128(15)..Key::from_i128(17), ], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -637,7 +926,17 @@ mod tests { let key_space2 = KeySpace { ranges: vec![Key::from_i128(9)..Key::from_i128(19)], }; - key_space1.remove_overlapping_with(&key_space2); + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + assert_eq!( key_space1.ranges, vec![ @@ -650,4 +949,412 @@ mod tests { ] ); } + #[test] + fn 
sharded_range_relation_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067F00000005000040100300000000").unwrap(), + end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), + }, + &shard_identity, + ); + + // Key range spans relations, expect MAX + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_single_key() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), + end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), + }, + &shard_identity, + ); + // Single-key range on logical size key + assert_eq!(range.page_count(), 1); + } + + /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation + #[test] + fn contiguous_range_check() { + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) + ),); + + // The ranges goes all the way up to the 0xffffffff, including it: this is + // not considered a rel block range because 0xffffffff stores logical sizes, + // not blocks. 
+ assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000000").unwrap()) + ),); + + // Keys within the normal data region of a relation + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df0000000000").unwrap() + ..Key::from_hex("000000067f00000001000004df0000000080").unwrap()) + ),); + + // The logical size key of one forkno, then some blocks in the next + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000080").unwrap()) + ),); + } + + #[test] + fn shard_identity_keyspaces_forkno_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(), + end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(), + }, + &shard_identity, + ); + + // Range spanning the end of one forkno and the start of the next: we do not attempt to + // calculate a valid size, because we have no way to know if they keys between start + // and end are actually in use. 
+ assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_one_relation() { + for shard_number in 0..4 { + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(), + end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(), + }, + &shard_identity, + ); + + // Very simple case: range covering block zero of one relation, where that block maps to shard zero + if shard_number == 0 { + assert_eq!(range.page_count(), 1); + } else { + // Other shards should perceive the range's size as zero + assert_eq!(range.page_count(), 0); + } + } + } + + /// Test helper: construct a ShardedRange and call fragment() on it, returning + /// the total page count in the range and the fragments. + fn do_fragment( + range_start: Key, + range_end: Key, + shard_identity: &ShardIdentity, + target_nblocks: u32, + ) -> (u32, Vec<(u32, Range)>) { + let range = ShardedRange::new( + Range { + start: range_start, + end: range_end, + }, + shard_identity, + ); + + let page_count = range.page_count(); + let fragments = range.fragment(target_nblocks); + + // Invariant: we always get at least one fragment + assert!(!fragments.is_empty()); + + // Invariant: the first/last fragment start/end should equal the input start/end + assert_eq!(fragments.first().unwrap().1.start, range_start); + assert_eq!(fragments.last().unwrap().1.end, range_end); + + if page_count > 0 { + // Invariant: every fragment must contain at least one shard-local page, if the + // total range contains at least one shard-local page + let all_nonzero = fragments.iter().all(|f| f.0 > 0); + if !all_nonzero { + eprintln!("Found a zero-length fragment: {:?}", fragments); + } + assert!(all_nonzero); + } else { + // A range with no shard-local pages should always be returned as a single fragment 
+ assert_eq!(fragments, vec![(0, range_start..range_end)]); + } + + // Invariant: fragments must be ordered and non-overlapping + let mut last: Option> = None; + for frag in &fragments { + if let Some(last) = last { + assert!(frag.1.start >= last.end); + assert!(frag.1.start > last.start); + } + last = Some(frag.1.clone()) + } + + // Invariant: fragments respect target_nblocks + for frag in &fragments { + assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks); + } + + (page_count, fragments) + } + + /// Really simple tests for fragment(), on a range that just contains a single stripe + /// for a single tenant. + #[test] + fn sharded_range_fragment_simple() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which we happen to know covers exactly one stripe which belongs to this shard + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + + // Ask for stripe_size blocks, we get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 32768), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for more, we still get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 10000000), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for target_nblocks of half the stripe size, we get two halves + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16384), + ( + 32768, + vec![ + (16384, input_start..input_start.add(16384)), + (16384, input_start.add(16384)..input_end) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_multi_stripe() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which covers multiple stripes, exactly one of which belongs to 
the current shard. + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + // Ask for all the blocks, get a fragment that covers the whole range but reports + // its size to be just the blocks belonging to our shard. + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 131072), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for a sub-stripe quantity + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16000), + ( + 32768, + vec![ + (16000, input_start..input_start.add(16000)), + (16000, input_start.add(16000)..input_start.add(32000)), + (768, input_start.add(32000)..input_end), + ] + ) + ); + + // Try on a range that starts slightly after our owned stripe + assert_eq!( + do_fragment(input_start.add(1), input_end, &shard_identity, 131072), + (32767, vec![(32767, input_start.add(1)..input_end)]) + ); + } + + /// Test our calculations work correctly when we start a range from the logical size key of + /// a previous relation. 
+ #[test] + fn sharded_range_fragment_starting_from_logical_size() { + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + + // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x8001, vec![(0x8001, input_start..input_end)]) + ); + + // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards + // store all logical sizes) + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x1, vec![(0x1, input_start..input_end)]) + ); + } + + /// Test that ShardedRange behaves properly when used on un-sharded data + #[test] + fn sharded_range_fragment_unsharded() { + let shard_identity = ShardIdentity::unsharded(); + + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + ( + 0x10000, + vec![ + (0x8000, input_start..input_start.add(0x8000)), + (0x8000, input_start.add(0x8000)..input_start.add(0x10000)) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_cross_relation() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap(); + assert_eq!( + 
do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + + // Same, but using a sharded identity + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + } + + #[test] + fn sharded_range_fragment_tiny_nblocks() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap(); + let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16), + ( + 0x38, + vec![ + (16, input_start..input_start.add(16)), + (16, input_start.add(16)..input_start.add(32)), + (16, input_start.add(32)..input_start.add(48)), + (8, input_start.add(48)..input_end), + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_fuzz() { + // Use a fixed seed: we don't want to explicitly pick values, but we do want + // the test to be reproducible. 
+ let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef); + + for _i in 0..1000 { + let shard_identity = if prng.next_u32() % 2 == 0 { + ShardIdentity::unsharded() + } else { + let shard_count = prng.next_u32() % 127 + 1; + ShardIdentity::new( + ShardNumber((prng.next_u32() % shard_count) as u8), + ShardCount::new(shard_count as u8), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap() + }; + + let target_nblocks = prng.next_u32() % 65536 + 1; + + let start_offset = prng.next_u32() % 16384; + + // Try ranges up to 4GiB in size, that are always at least 1 + let range_size = prng.next_u32() % 8192 + 1; + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067F00000001000004E10000000000") + .unwrap() + .add(start_offset); + let input_end = input_start.add(range_size); + + // This test's main success conditions are the invariants baked into do_fragment + let (_total_size, fragments) = + do_fragment(input_start, input_end, &shard_identity, target_nblocks); + + // Pick a random key within the range and check it appears in the output + let example_key = input_start.add(prng.next_u32() % range_size); + + // Panic on unwrap if it isn't found + let example_key_frag = fragments + .iter() + .find(|f| f.1.contains(&example_key)) + .unwrap(); + + // Check that the fragment containing our random key has a nonzero size if + // that key is shard-local + let example_key_local = !shard_identity.is_key_disposable(&example_key); + if example_key_local { + assert!(example_key_frag.0 > 0); + } + } + } } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 1b948d60c3..532185a366 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,6 +1,5 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use const_format::formatcp; pub mod controller_api; pub mod key; @@ -11,7 +10,4 @@ pub mod shard; /// Public API types pub mod 
upcall_api; -pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; -pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); -pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; -pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +pub mod config; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b4909f247f..a54cdb520d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -303,6 +303,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub switch_to_aux_file_v2: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -429,7 +430,6 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: Option, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -747,10 +747,18 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, + /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`. + /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`. + pub kind: Cow<'static, str>, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalRedoManagerStatus { pub last_redo_at: Option>, - pub pid: Option, + pub process: Option, } /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. 
initiating @@ -772,6 +780,17 @@ pub struct SecondaryProgress { pub bytes_total: u64, } +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantScanRemoteStorageShard { + pub tenant_shard_id: TenantShardId, + pub generation: Option, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TenantScanRemoteStorageResponse { + pub shards: Vec, +} + pub mod virtual_file { #[derive( Copy, @@ -839,39 +858,72 @@ impl TryFrom for PagestreamBeMessageTag { } } +// In the V2 protocol version, a GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. +// +// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. 
That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing. There is no +// difference in the responses between V1 and V2. +// +// The Request structs below reflect the V2 interface. If V1 is used, the parse function +// maps the old format requests to the new format. +// +#[derive(Clone, Copy)] +pub enum PagestreamProtocolVersion { + V1, + V2, +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub dbnode: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetSlruSegmentRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub kind: u8, pub segno: u32, } @@ -918,14 +970,16 @@ pub struct TenantHistorySize { } impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. This is currently only used in testing + /// tools. Always uses protocol version 2. 
pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -934,8 +988,8 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -944,8 +998,8 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -955,15 +1009,15 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { bytes.put_u8(4); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } @@ -972,18 +1026,40 @@ impl PagestreamFeMessage { bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { - // TODO these gets can fail - + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
let msg_tag = body.read_u8()?; + + let (request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V1 => { + // In the old protocol, each message starts with a boolean 'latest' flag, + // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and + // 'not_modified_since', used in the new protocol version. + let latest = body.read_u8()? != 0; + let request_lsn = Lsn::from(body.read_u64::()?); + if latest { + (Lsn::MAX, request_lsn) // get latest version + } else { + (request_lsn, request_lsn) // get version at specified LSN + } + } + }; + + // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -992,8 +1068,8 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1002,8 +1078,8 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -1013,14 +1089,14 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, dbnode: body.read_u32::()?, })), 4 => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - latest: body.read_u8()? 
!= 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, kind: body.read_u8()?, segno: body.read_u32::()?, }, @@ -1148,8 +1224,8 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1158,8 +1234,8 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: false, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(4), rel: RelTag { forknum: 1, spcnode: 2, @@ -1168,8 +1244,8 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -1179,14 +1255,16 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) + .unwrap(); assert!(msg == reconstructed); } } diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index 0d287f7be0..f6644be635 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -1,9 +1,11 @@ use utils::lsn::Lsn; +use crate::keyspace::SparseKeySpace; + #[derive(Debug, PartialEq, Eq)] pub struct Partitioning { pub keys: crate::keyspace::KeySpace, - + pub sparse_keys: crate::keyspace::SparseKeySpace, pub at_lsn: Lsn, } @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning { let mut map = serializer.serialize_map(Some(2))?; map.serialize_key("keys")?; 
map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("sparse_keys")?; + map.serialize_value(&KeySpace(&self.sparse_keys.0))?; map.serialize_key("at_lsn")?; map.serialize_value(&WithDisplay(&self.at_lsn))?; map.end() @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { #[derive(serde::Deserialize)] struct De { keys: KeySpace, + sparse_keys: KeySpace, #[serde_as(as = "serde_with::DisplayFromStr")] at_lsn: Lsn, } @@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { Ok(Self { at_lsn: de.at_lsn, keys: de.keys.0, + sparse_keys: SparseKeySpace(de.sparse_keys.0), }) } } @@ -133,6 +139,12 @@ mod tests { "030000000000000000000000000000000003" ] ], + "sparse_keys": [ + [ + "620000000000000000000000000000000000", + "620000000000000000000000000000000003" + ] + ], "at_lsn": "0/2240160" } "#; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a2a9165184..ff6d3d91b6 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,21 +5,99 @@ use crate::{ models::ShardParameters, }; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide an summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. +/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. 
This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. +/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. +/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(u8); +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. 
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + impl ShardCount { pub const MAX: Self = Self(u8::MAX); /// The internal value of a ShardCount may be zero, which means "1 shard, but use /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as `TenantShardId::unsharded`. + /// as [`TenantShardId::unsharded`]. /// /// This method returns the actual number of shards, i.e. 
if our internal value is /// zero, we return 1 (unsharded tenants have 1 shard). @@ -38,6 +116,9 @@ impl ShardCount { self.0 } + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. pub fn is_unsharded(&self) -> bool { self.0 == 0 } @@ -53,33 +134,6 @@ impl ShardNumber { pub const MAX: Self = Self(u8::MAX); } -/// TenantShardId identify the units of work for the Pageserver. -/// -/// These are written as `-`, for example: -/// -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. -/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -111,10 +165,13 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. 
- pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() } @@ -150,9 +207,6 @@ impl TenantShardId { } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -222,16 +276,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -246,6 +290,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -313,6 +360,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. 
let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -390,16 +439,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - pub stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -414,7 +453,7 @@ impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. - pub fn unsharded() -> Self { + pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), @@ -439,6 +478,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -487,6 +529,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. 
pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -497,7 +541,9 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? @@ -523,7 +569,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } @@ -606,7 +652,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. 
+ let is_initfork = key.field5 == INIT_FORKNUM; + + !is_rel_block_key(key) || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index aa6845b9b1..0d6986778a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{PageHeaderData, XLogRecord}; -pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::xlog_utils::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; pub use v14::bindings::{CheckPoint, ControlFileData}; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 4a66a0ab1d..0bbb91afc2 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -331,7 +331,10 @@ impl CheckPoint { /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround. - let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID); + let mut new_xid = std::cmp::max( + xid.wrapping_add(1), + pg_constants::FIRST_NORMAL_TRANSACTION_ID, + ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = @@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD); + // xlp_rem_len doesn't include page header, hence the subtraction. 
+ ( + seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) } else { (0, 0) }; @@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + ( + (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) + } else { + (0, 0) + }; let header = XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - pg_constants::XLP_FIRST_IS_CONTRECORD - } else { - 0 - }, + xlp_info, xlp_tli: PG_TLI, xlp_pageaddr: lsn.page_lsn().0, - xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - page_off as u32 - } else { - 0u32 - }, + xlp_rem_len, ..Default::default() // Put 0 in padding fields. }; let hdr_bytes = header.encode()?; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 23786e3b08..262068cbda 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,7 +4,9 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +use postgres_ffi::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -262,11 +264,21 @@ fn craft_internal( intermediate_lsns.insert(0, initial_lsn); } - // Some records may be not flushed, e.g. non-transactional logical messages. + // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. // - // Note: this is broken if pg_current_wal_insert_lsn is at page boundary - // because pg_current_wal_insert_lsn skips page headers. 
- client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; + // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn + // returns the position just after the page header on the next page. That's where the next + // record will be inserted. But the page header hasn't actually been written to the WAL + // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" + // error. Because of that, if the insert location is just after a page header, back off to + // previous page boundary. + let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); + if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; + } + client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; Ok(intermediate_lsns) } @@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { client.execute("CREATE table t(x int)", &[])?; - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We + // will use carefully-sized logical messages to advance WAL insert location such + // that there is just enough space on the page for the XLOG_SWITCH record. + loop { + // We start with measuring how much WAL it takes for one logical message, + // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. 
client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; + let after_lsn = client.pg_current_wal_insert_lsn()?; + + // Did the record cross a page boundary? If it did, start over. Crossing a + // page boundary adds to the apparent size of the record because of the page + // header, which throws off the calculation. + if u64::from(before_lsn) / XLOG_BLCKSZ as u64 + != u64::from(after_lsn) / XLOG_BLCKSZ as u64 + { + continue; + } + // base_size is the size of a logical message without the payload + let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; + + // Is there enough space on the page for another logical message and an + // XLOG_SWITCH? If not, start over. + let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; + if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 { + continue; + } + + // We will write another logical message, such that after the logical message + // record, there will be space for exactly one XLOG_SWITCH. How large should + // the logical message's payload be? An XLOG_SWITCH record has no data => its + // size is exactly XLOG_SIZE_OF_XLOG_RECORD. 
+ let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; + + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + break; } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; info!( "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", client.pg_current_wal_insert_lsn()?, diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4a53f485ca..78da01c9a0 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true itertools.workspace = true +sync_wrapper = { workspace = true, features = ["futures"] } [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 5fff3e25c9..24c1248304 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; @@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use bytes::Bytes; +use futures::future::Either; use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; @@ -128,12 +130,12 @@ impl AzureBlobStorage { let kind = RequestKind::Get; let _permit = self.permit(kind, cancel).await?; + let cancel_or_timeout = 
crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); - // TODO give proper streaming response instead of buffering into RAM - // https://github.com/neondatabase/neon/issues/5563 let download = async { let response = builder @@ -152,39 +154,46 @@ impl AzureBlobStorage { Err(_elapsed) => Err(DownloadError::Timeout), }); - let mut response = std::pin::pin!(response); + let mut response = Box::pin(response); - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part?; - if etag.is_none() { - etag = Some(part.blob.properties.etag); - } - if last_modified.is_none() { - last_modified = Some(part.blob.properties.last_modified.into()); - } - if let Some(blob_meta) = part.blob.metadata { - metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); - } - let data = part - .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); - } - - if bufs.is_empty() { + let Some(part) = response.next().await else { return Err(DownloadError::Other(anyhow::anyhow!( - "Azure GET response contained no buffers" + "Azure GET response contained no response body" ))); + }; + let part = part?; + if etag.is_none() { + etag = Some(part.blob.properties.etag); } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } + if let Some(blob_meta) = part.blob.metadata { + metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); + } + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already let etag = etag.unwrap(); let last_modified = last_modified.unwrap(); + let tail_stream = response + .map(|part| match part { + Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), + Err(e) => { + 
Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) + } + }) + .flatten(); + let stream = part + .data + .map(|r| r.map_err(io::Error::other)) + .chain(sync_wrapper::SyncStream::new(tail_stream)); + //.chain(SyncStream::from_pin(Box::pin(tail_stream))); + + let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); + Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + download_stream: Box::pin(download_stream), etag, last_modified, metadata: Some(StorageMetadata(metadata)), @@ -193,7 +202,10 @@ impl AzureBlobStorage { tokio::select! { bufs = download => bufs, - _ = cancel.cancelled() => Err(DownloadError::Cancelled), + cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { + TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + }, } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e708854be2..708662f20f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -21,11 +21,13 @@ use std::{ fmt::Debug, num::{NonZeroU32, NonZeroUsize}, pin::Pin, + str::FromStr, sync::Arc, time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; +use aws_sdk_s3::types::StorageClass; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; @@ -53,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -/// We set this a little bit low as we currently buffer the entire file into RAM +/// Set this limit analogously to the S3 limit /// /// Here, a limit of max 20k concurrent connections was noted. 
/// -pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30; +pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; @@ -134,6 +136,11 @@ impl RemotePath { pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } + + pub fn add_trailing_slash(&self) -> Self { + // Unwrap safety inputs are guararnteed to be valid UTF-8 + Self(format!("{}/", self.0).try_into().unwrap()) + } } /// We don't need callers to be able to pass arbitrary delimiters: just control @@ -157,47 +164,21 @@ pub struct Listing { /// providing basic CRUD operations for storage files. #[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all top level subdirectories for a given prefix - /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id - /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) - /// so this method doesnt need to. - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::WithDelimiter, None, cancel) - .await? - .prefixes; - Ok(result) - } - /// Lists all files in directory "recursively" - /// (not really recursively, because AWS has a flat namespace) - /// Note: This is subtely different than list_prefixes, - /// because it is for listing files instead of listing - /// names sharing common prefixes. - /// For example, - /// list_files("foo/bar") = ["foo/bar/cat123.txt", - /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"] - /// whereas, - /// list_prefixes("foo/bar/") = ["cat", "dog"] - /// See `test_real_s3.rs` for more details. 
+ /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. + /// (see ``) + /// + /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not + /// from the absolute root of the bucket. + /// + /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of + /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are + /// returned in `keys` (). + /// + /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function + /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on + /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. /// - /// max_keys limits max number of keys returned; None means unlimited. - async fn list_files( - &self, - prefix: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::NoDelimiter, max_keys, cancel) - .await? - .keys; - Ok(result) - } - async fn list( &self, prefix: Option<&RemotePath>, @@ -336,41 +317,6 @@ impl GenericRemoteStorage> { } } - // A function for listing all the files in a "directory" - // Example: - // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - // - // max_keys limits max number of keys returned; None means unlimited. 
- pub async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await, - Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await, - Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await, - Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await, - } - } - - // lists common *prefixes*, if any of files - // Example: - // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] - pub async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await, - Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await, - Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await, - Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await, - } - } - /// See [`RemoteStorage::upload`] pub async fn upload( &self, @@ -619,6 +565,7 @@ pub struct S3Config { /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, pub max_keys_per_list_response: Option, + pub upload_storage_class: Option, } impl Debug for S3Config { @@ -747,6 +694,18 @@ impl RemoteStorageConfig { endpoint, concurrency_limit, max_keys_per_list_response, + upload_storage_class: toml + .get("upload_storage_class") + .map(|prefix_in_bucket| -> anyhow::Result<_> { + let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + bail!("Specified storage class unknown to SDK: '{s}'. 
Allowed values: {:?}", StorageClass::values()); + } + Ok(storage_class) + }) + .transpose()?, }) } (_, _, _, Some(_), None) => { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 8cad863731..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,11 +5,9 @@ //! volume is mounted to the local FS. use std::{ - borrow::Cow, - future::Future, + collections::HashSet, io::ErrorKind, num::NonZeroU32, - pin::Pin, time::{Duration, SystemTime, UNIX_EPOCH}, }; @@ -22,11 +20,11 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tokio_util::{io::ReaderStream, sync::CancellationToken}; -use tracing::*; -use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; +use utils::crashsafe::path_with_suffix_extension; use crate::{ Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -93,7 +91,47 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - Ok(get_all_files(&self.storage_root, true) + use std::{future::Future, pin::Pin}; + fn get_all_files<'a, P>( + directory_path: P, + ) -> Pin>> + Send + Sync + 'a>> + where + P: AsRef + Send + Sync + 'a, + { + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? 
{ + let file_type = dir_entry.file_type().await?; + let entry_path = + Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { + anyhow::Error::msg(format!( + "non-Unicode path: {}", + pb.to_string_lossy() + )) + })?; + if file_type.is_symlink() { + tracing::debug!("{entry_path:?} is a symlink, skipping") + } else if file_type.is_dir() { + paths.extend(get_all_files(&entry_path).await?.into_iter()) + } else { + paths.push(entry_path); + } + } + Ok(paths) + } else { + bail!("Path {directory_path:?} is not a directory") + } + } else { + Ok(Vec::new()) + } + }) + } + + Ok(get_all_files(&self.storage_root) .await? .into_iter() .map(|path| { @@ -120,6 +158,14 @@ impl LocalFs { // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. let mut initial_dir = full_path.clone(); + + // If there's no trailing slash, we have to start looking from one above: even if + // `initial_dir` is a directory, we should still list any prefixes in the parent + // that start with the same string. + if !full_path.to_string().ends_with('/') { + initial_dir.pop(); + } + loop { // Did we make it to the root? 
if initial_dir.parent().is_none() { @@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs { let op = async { let mut result = Listing::default(); - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - if let Some(max_keys) = max_keys { - result.keys.truncate(max_keys.get() as usize); - } - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) + // Filter out directories: in S3 directories don't exist, only the keys within them do. + let keys = self + .list_recursive(prefix) .await .map_err(DownloadError::Other)?; + let keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? - { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); + if let ListingMode::NoDelimiter = mode { + result.keys = keys; + } else { + let mut prefixes = HashSet::new(); + for key in keys { + // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. 
+ let relative_key = if let Some(prefix) = prefix { + let mut prefix = prefix.clone(); + // We only strip the dirname of the prefix, so that when we strip it from the start of keys we + // end up with full file/dir names. + let prefix_full_local_path = prefix.with_base(&self.storage_root); + let has_slash = prefix.0.to_string().ends_with('/'); + let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { + prefix + } else { + prefix.0.pop(); + prefix + }; + + RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() + } else { + key + }; + + let relative_key = format!("{}", relative_key); + if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { + let first_part = relative_key + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .next() + .unwrap() + .to_owned(); + prefixes.insert(first_part); + } else { + result + .keys + .push(RemotePath::from_string(&relative_key).unwrap()); + } } + result.prefixes = prefixes + .into_iter() + .map(|s| RemotePath::from_string(&s).unwrap()) + .collect(); } + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } Ok(result) }; @@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } -fn get_all_files<'a, P>( - directory_path: P, - recursive: bool, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? 
{ - let file_type = dir_entry.file_type().await?; - let entry_path = - Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { - anyhow::Error::msg(format!( - "non-Unicode path: {}", - pb.to_string_lossy() - )) - })?; - if file_type.is_symlink() { - debug!("{entry_path:?} is a symlink, skipping") - } else if file_type.is_dir() { - if recursive { - paths.extend(get_all_files(&entry_path, true).await?.into_iter()) - } else { - paths.push(entry_path) - } - } else { - paths.push(entry_path); - } - } - Ok(paths) - } else { - bail!("Path {directory_path:?} is not a directory") - } - } else { - Ok(Vec::new()) - } - }) -} - async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, @@ -923,13 +930,18 @@ mod fs_tests { // No delimiter: should recursively list everything let (storage, cancel) = create_storage()?; let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let child_sibling = + upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?; let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; let listing = storage .list(None, ListingMode::NoDelimiter, None, &cancel) .await?; assert!(listing.prefixes.is_empty()); - assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); + assert_eq!( + listing.keys.into_iter().collect::>(), + HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) + ); // Delimiter: should only go one deep let listing = storage @@ -942,7 +954,25 @@ mod fs_tests { ); assert!(listing.keys.is_empty()); - // Delimiter & prefix + // Delimiter & prefix with a trailing slash + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!( + listing.keys, + 
[RemotePath::from_string("uncle").unwrap()].to_vec() + ); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("parent").unwrap()].to_vec() + ); + + // Delimiter and prefix without a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), @@ -951,12 +981,66 @@ mod fs_tests { &cancel, ) .await?; + assert_eq!(listing.keys, [].to_vec()); assert_eq!( listing.prefixes, - [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()] - .to_vec() + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn list_part_component() -> anyhow::Result<()> { + // No delimiter: should recursively list everything + let (storage, cancel) = create_storage()?; + + // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing + // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as + // a freeform prefix. 
+ let _child_a = + upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; + let _child_b = + upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some( + &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), + ), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + + let mut found_prefixes = listing.prefixes.clone(); + found_prefixes.sort(); + assert_eq!( + found_prefixes, + [ + RemotePath::from_string("tenant").unwrap(), + RemotePath::from_string("tenant-01").unwrap(), + ] + .to_vec() ); - assert_eq!(listing.keys, [uncle.clone()].to_vec()); Ok(()) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1cb85cfb1b..c0b89cee2a 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -30,7 +30,7 @@ use aws_sdk_s3::{ config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; @@ -62,6 +62,7 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. 
pub timeout: Duration, @@ -154,6 +155,7 @@ impl S3Bucket { max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + upload_storage_class: aws_config.upload_storage_class.clone(), timeout, }) } @@ -178,10 +180,7 @@ impl S3Bucket { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), @@ -471,16 +470,11 @@ impl RemoteStorage for S3Bucket { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + .or_else(|| { + self.prefix_in_bucket.clone().map(|mut s| { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + s + }) }); let _permit = self.permit(kind, cancel).await?; @@ -549,11 +543,15 @@ impl RemoteStorage for S3Bucket { } } - result.prefixes.extend( - prefixes - .iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), - ); + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + self.s3_object_to_relative_path( + o.prefix()? 
+ .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), @@ -586,6 +584,7 @@ impl RemoteStorage for S3Bucket { .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) .send(); @@ -637,6 +636,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) .send(); @@ -894,6 +894,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(key) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) .send(); @@ -1050,22 +1051,22 @@ mod tests { Some("/test/prefix/"), ]; let expected_outputs = [ - vec!["", "some/path", "some/path"], - vec!["/", "/some/path", "/some/path"], + vec!["", "some/path", "some/path/"], + vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], ]; @@ -1077,6 +1078,7 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; let storage = S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index f5344d3ae2..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -107,27 +107,6 @@ 
impl UnreliableWrapper { type VoidStorage = crate::LocalFs; impl RemoteStorage for UnreliableWrapper { - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_prefixes(prefix, cancel).await - } - - async fn list_files( - &self, - folder: Option<&RemotePath>, - max_keys: Option, - cancel: &CancellationToken, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned())) - .map_err(DownloadError::Other)?; - self.inner.list_files(folder, max_keys, cancel).await - } - async fn list( &self, prefix: Option<&RemotePath>, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 72f6f956e0..673151c8ef 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,5 +1,6 @@ use anyhow::Context; use camino::Utf8Path; +use remote_storage::ListingMode; use remote_storage::RemotePath; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; @@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) .context("common_prefix construction")?; let root_remote_prefixes = test_client - .list_prefixes(None, &cancel) - .await - .context("client list root prefixes failure")? + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes .into_iter() .collect::>(); assert_eq!( @@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a ); let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix), &cancel) - .await - .context("client list nested prefixes failure")? + .list( + Some(&base_prefix.add_trailing_slash()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? 
+ .prefixes .into_iter() .collect::>(); let remote_only_prefixes = nested_remote_prefixes @@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a /// /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] #[tokio::test] -async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { +async fn list_no_delimiter_works( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { let ctx = match ctx { MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), @@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a let base_prefix = RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; let root_files = test_client - .list_files(None, None, &cancel) + .list(None, ListingMode::NoDelimiter, None, &cancel) .await .context("client list root files failure")? + .keys .into_iter() .collect::>(); assert_eq!( root_files, ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." + "remote storage list on root mismatches with the uploads." ); // Test that max_keys limit works. In total there are about 21 files (see // upload_simple_remote_data call in test_real_s3.rs). 
let limited_root_files = test_client - .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel) + .list( + None, + ListingMode::NoDelimiter, + Some(NonZeroU32::new(2).unwrap()), + &cancel, + ) .await .context("client list root files failure")?; - assert_eq!(limited_root_files.len(), 2); + assert_eq!(limited_root_files.keys.len(), 2); let nested_remote_files = test_client - .list_files(Some(&base_prefix), None, &cancel) + .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel) .await .context("client list nested files failure")? + .keys .into_iter() .collect::>(); let trim_remote_blobs: HashSet<_> = ctx @@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a .collect(); assert_eq!( nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." + "remote storage list on subdirrectory mismatches with the uploads." ); Ok(()) } @@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( ctx.client.delete_objects(&[path1, path2], &cancel).await?; - let prefixes = ctx.client.list_prefixes(None, &cancel).await?; + let prefixes = ctx + .client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes; assert_eq!(prefixes.len(), 1); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6aa02868e6..cd0b2be4b5 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index c5d5216f00..a273abe867 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -12,8 +12,8 @@ use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; use remote_storage::{ - DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, - S3Config, + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, + RemoteStorageKind, S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; @@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: client: &Arc, cancel: &CancellationToken, ) -> anyhow::Result> { - Ok(retry(|| client.list_files(None, None, cancel)) - .await - .context("list root files failure")? - .into_iter() - .collect::>()) + Ok( + retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) + .await + .context("list root files failure")? + .keys + .into_iter() + .collect::>(), + ) } let cancel = CancellationToken::new(); @@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, @@ -381,6 +380,7 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index af15cee924..b703e883de 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -34,6 +34,8 @@ pub enum Generation { /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { + pub const MAX: Self = Self::Valid(u32::MAX); + /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b09350d11e..2953f0aad4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -92,6 +92,8 @@ pub mod zstd; pub mod env; +pub mod poison; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..0bf5664f47 --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,121 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! # tokio_test::block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! 
// If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. + pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. 
+pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 0544c5be03..375b227b99 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -2,11 +2,10 @@ use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; -use std::fmt::Debug; use std::mem; use std::sync::Mutex; use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number @@ -35,23 +34,73 @@ pub trait MonotonicCounter { fn cnt_value(&self) -> V; } -/// Internal components of a `SeqWait` -struct SeqWaitInt +/// Heap of waiters, lowest numbers pop first. +struct Waiters where - S: MonotonicCounter, V: Ord, { - waiters: BinaryHeap>, - current: S, - shutdown: bool, + heap: BinaryHeap>, + /// Number of the first waiter in the heap, or None if there are no waiters. 
+ status_channel: watch::Sender>, +} + +impl Waiters +where + V: Ord + Copy, +{ + fn new() -> Self { + Waiters { + heap: BinaryHeap::new(), + status_channel: channel(None).0, + } + } + + /// `status_channel` contains the number of the first waiter in the heap. + /// This function should be called whenever waiters heap changes. + fn update_status(&self) { + let first_waiter = self.heap.peek().map(|w| w.wake_num); + let _ = self.status_channel.send_replace(first_waiter); + } + + /// Add new waiter to the heap, return a channel that will be notified when the number arrives. + fn add(&mut self, num: V) -> watch::Receiver<()> { + let (tx, rx) = channel(()); + self.heap.push(Waiter { + wake_num: num, + wake_channel: tx, + }); + self.update_status(); + rx + } + + /// Pop all waiters <= num from the heap. Collect channels in a vector, + /// so that caller can wake them up. + fn pop_leq(&mut self, num: V) -> Vec> { + let mut wake_these = Vec::new(); + while let Some(n) = self.heap.peek() { + if n.wake_num > num { + break; + } + wake_these.push(self.heap.pop().unwrap().wake_channel); + } + self.update_status(); + wake_these + } + + /// Used on shutdown to efficiently drop all waiters. + fn take_all(&mut self) -> BinaryHeap> { + let heap = mem::take(&mut self.heap); + self.update_status(); + heap + } } struct Waiter where T: Ord, { - wake_num: T, // wake me when this number arrives ... - wake_channel: Sender<()>, // ... by sending a message to this channel + wake_num: T, // wake me when this number arrives ... + wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. 
Reverse the ordering here @@ -76,6 +125,17 @@ impl PartialEq for Waiter { impl Eq for Waiter {} +/// Internal components of a `SeqWait` +struct SeqWaitInt +where + S: MonotonicCounter, + V: Ord, +{ + waiters: Waiters, + current: S, + shutdown: bool, +} + /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. @@ -108,7 +168,7 @@ where /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { - waiters: BinaryHeap::new(), + waiters: Waiters::new(), current: starting_num, shutdown: false, }; @@ -128,9 +188,8 @@ where // Block any future waiters from starting internal.shutdown = true; - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) + // Take all waiters to drop them later. + internal.waiters.take_all() // Drop the lock as we exit this scope. }; @@ -196,7 +255,7 @@ where /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. - fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { + fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { let mut internal = self.internal.lock().unwrap(); if internal.current.cnt_value() >= num { return Ok(None); @@ -205,12 +264,8 @@ where return Err(SeqWaitError::Shutdown); } - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.push(Waiter { - wake_num: num, - wake_channel: tx, - }); + // Add waiter channel to the queue. + let rx = internal.waiters.add(num); // Drop the lock as we exit this scope. Ok(Some(rx)) } @@ -231,16 +286,8 @@ where } internal.current.cnt_advance(num); - // Pop all waiters <= num from the heap. Collect them in a vector, and - // wake them up after releasing the lock. 
- let mut wake_these = Vec::new(); - while let Some(n) = internal.waiters.peek() { - if n.wake_num > num { - break; - } - wake_these.push(internal.waiters.pop().unwrap().wake_channel); - } - wake_these + // Pop all waiters <= num from the heap. + internal.waiters.pop_leq(num) }; for tx in wake_these { @@ -255,6 +302,23 @@ where pub fn load(&self) -> S { self.internal.lock().unwrap().current } + + /// Get a Receiver for the current status. + /// + /// The current status is the number of the first waiter in the queue, + /// or None if there are no waiters. + /// + /// This receiver will be notified whenever the status changes. + /// It is useful for receiving notifications when the first waiter + /// starts waiting for a number, or when there are no more waiters left. + pub fn status_receiver(&self) -> watch::Receiver> { + self.internal + .lock() + .unwrap() + .waiters + .status_channel + .subscribe() + } } #[cfg(test)] diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 8eee1f72a6..1abd3d9861 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -192,6 +192,14 @@ impl OnceCell { } } + /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never + /// initialized. + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let inner = self.inner.get_mut().unwrap(); + + inner.take_and_deinit() + } + /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. pub fn initializer_count(&self) -> usize { self.initializers.load(Ordering::Relaxed) @@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> { /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. 
pub fn take_and_deinit(mut self) -> (T, InitPermit) { + self.0 + .take_and_deinit() + .expect("guard is not created unless value has been initialized") + } +} + +impl Inner { + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let value = self.value.take()?; + let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit sem.try_acquire().expect("we just created this").forget(); - std::mem::swap(&mut *self.0, &mut swapped); - swapped - .value - .map(|v| (v, InitPermit(sem))) - .expect("guard is not created unless value has been initialized") + let permit = InitPermit(sem); + std::mem::swap(self, &mut swapped); + Some((value, permit)) } } @@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> { /// On drop, this type will return the permit. pub struct InitPermit(Arc); +impl std::fmt::Debug for InitPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ptr = Arc::as_ptr(&self.0) as *const (); + f.debug_tuple("InitPermit").field(&ptr).finish() + } +} + impl Drop for InitPermit { fn drop(&mut self) { assert_eq!( @@ -559,4 +582,22 @@ mod tests { assert_eq!(*target.get().unwrap(), 11); } + + #[tokio::test] + async fn take_and_deinit_on_mut() { + use std::convert::Infallible; + + let mut target = OnceCell::::default(); + assert!(target.take_and_deinit().is_none()); + + target + .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) + .await + .unwrap(); + + let again = target.take_and_deinit(); + assert!(matches!(again, Some((42, _))), "{again:?}"); + + assert!(target.take_and_deinit().is_none()); + } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index e82aee2462..0f1a7fe688 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -69,6 +69,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = 
true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ffe607be4b..5b871c5d5e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,30 +27,50 @@ //! //! # Reference Numbers //! -//! 2024-04-04 on i3en.3xlarge +//! 2024-04-15 on i3en.3xlarge //! //! ```text -//! short/1 time: [25.925 µs 26.060 µs 26.209 µs] -//! short/2 time: [31.277 µs 31.483 µs 31.722 µs] -//! short/4 time: [45.496 µs 45.831 µs 46.182 µs] -//! short/8 time: [84.298 µs 84.920 µs 85.566 µs] -//! short/16 time: [185.04 µs 186.41 µs 187.88 µs] -//! short/32 time: [385.01 µs 386.77 µs 388.70 µs] -//! short/64 time: [770.24 µs 773.04 µs 776.04 µs] -//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms] -//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs] -//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs] -//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs] -//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs] -//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms] -//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms] -//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms] -//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms] +//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! 
async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs] +//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs] +//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs] +//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs] +//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs] +//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs] +//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs] +//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms] +//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs] +//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs] +//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs] +//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs] +//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms] +//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms] +//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms] +//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms] //! 
``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{ + config::PageServerConf, + walrecord::NeonWalRecord, + walredo::{PostgresRedoManager, ProcessKind}, +}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ sync::Arc, @@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("short"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-short")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } - } - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("medium"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(format!("{process_kind}-medium")); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = 
Arc::new(Request::medium_input()); + b.iter_custom(|iters| { + bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) + }); + }, + ); + } } } } @@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. -fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { +fn bench_impl( + process_kind: ProcessKind, + redo_work: Arc, + n_redos: u64, + nclients: u64, +) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + conf.walredo_process_kind = process_kind; let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); @@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); + // divide the amount of work equally among the clients. 
+ let nredos_per_client = n_redos / nclients; for _ in 0..nclients { rt.block_on(async { tasks.spawn(client( Arc::clone(&manager), Arc::clone(&start), Arc::clone(&redo_work), - // divide the amount of work equally among the clients - n_redos / nclients, + nredos_per_client, )) }); } - rt.block_on(async move { - let mut total_wallclock_time = std::time::Duration::from_millis(0); + let elapsed = rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; while let Some(res) = tasks.join_next().await { total_wallclock_time += res.unwrap(); } total_wallclock_time - }) + }); + + // consistency check to ensure process kind setting worked + if nredos_per_client > 0 { + assert_eq!( + manager + .status() + .process + .map(|p| p.kind) + .expect("the benchmark work causes a walredo process to be spawned"), + std::borrow::Cow::Borrowed(process_kind.into()) + ); + } + + elapsed } async fn client( diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 3c9982ffb8..6df8b2170d 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -243,6 +243,19 @@ impl Client { Ok(()) } + pub async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/scan_remote_storage", + self.mgmt_api_endpoint + ); + let response = self.request(Method::GET, &uri, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; @@ -271,6 +284,34 @@ impl Client { Ok((status, progress)) } + pub async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/secondary/status", + self.mgmt_api_endpoint, tenant_shard_id + )) + .expect("Cannot build URL"); + + 
self.request(Method::GET, path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/heatmap_upload", + self.mgmt_api_endpoint, tenant_id + )) + .expect("Cannot build URL"); + + self.request(Method::POST, path, ()).await?; + Ok(()) + } + pub async fn location_config( &self, tenant_shard_id: TenantShardId, @@ -278,10 +319,7 @@ impl Client { flush_ms: Option, lazy: bool, ) -> Result<()> { - let req_body = TenantLocationConfigRequest { - tenant_id: Some(tenant_shard_id), - config, - }; + let req_body = TenantLocationConfigRequest { config }; let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 49175b3b90..f9507fc47a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 5261746b22..137b93055a 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -18,6 +18,7 @@ //! database size. For example, if the logical database size is 10 GB, we would //! generate new image layers every 10 GB of WAL. 
use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use tracing::{debug, info}; use std::collections::{HashSet, VecDeque}; @@ -125,6 +126,7 @@ async fn compact_level( } let mut state = LevelCompactionState { + shard_identity: *executor.get_shard_identity(), target_file_size, _lsn_range: lsn_range.clone(), layers: layer_fragments, @@ -164,6 +166,8 @@ struct LevelCompactionState<'a, E> where E: CompactionJobExecutor, { + shard_identity: ShardIdentity, + // parameters target_file_size: u64, @@ -366,6 +370,7 @@ where .executor .get_keyspace(&job.key_range, job.lsn_range.end, ctx) .await?, + &self.shard_identity, ) * 8192; let wal_size = job @@ -430,7 +435,7 @@ where keyspace, self.target_file_size / 8192, ); - while let Some(key_range) = window.choose_next_image() { + while let Some(key_range) = window.choose_next_image(&self.shard_identity) { new_jobs.push(CompactionJob:: { key_range, lsn_range: job.lsn_range.clone(), @@ -623,7 +628,12 @@ impl KeyspaceWindowPos { } // Advance the cursor until it reaches 'target_keysize'. - fn advance_until_size(&mut self, w: &KeyspaceWindowHead, max_size: u64) { + fn advance_until_size( + &mut self, + w: &KeyspaceWindowHead, + max_size: u64, + shard_identity: &ShardIdentity, + ) { while self.accum_keysize < max_size && !self.reached_end(w) { let curr_range = &w.keyspace[self.keyspace_idx]; if self.end_key < curr_range.start { @@ -632,7 +642,7 @@ impl KeyspaceWindowPos { } // We're now within 'curr_range'. Can we advance past it completely? 
- let distance = K::key_range_size(&(self.end_key..curr_range.end)); + let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); if (self.accum_keysize + distance as u64) < max_size { // oh yeah, it fits self.end_key = curr_range.end; @@ -641,7 +651,7 @@ impl KeyspaceWindowPos { } else { // advance within the range let skip_key = self.end_key.skip_some(); - let distance = K::key_range_size(&(self.end_key..skip_key)); + let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); if (self.accum_keysize + distance as u64) < max_size { self.end_key = skip_key; self.accum_keysize += distance as u64; @@ -677,7 +687,7 @@ where } } - fn choose_next_image(&mut self) -> Option> { + fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { if self.start_pos.keyspace_idx == self.head.keyspace.len() { // we've reached the end return None; @@ -687,6 +697,7 @@ where next_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + self.head.target_keysize, + shard_identity, ); // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to @@ -695,6 +706,7 @@ where end_pos.advance_until_size( &self.head, self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + shard_identity, ); if end_pos.reached_end(&self.head) { // gobble up any unused keyspace between the last used key and end of the range diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 9de6363d6e..1b80373ba7 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -5,6 +5,7 @@ use crate::interface::*; use futures::future::BoxFuture; use futures::{Stream, StreamExt}; use itertools::Itertools; +use pageserver_api::shard::ShardIdentity; use pin_project_lite::pin_project; use std::collections::BinaryHeap; use std::collections::VecDeque; @@ -13,11 +14,17 @@ use std::ops::{DerefMut, Range}; use std::pin::Pin; use std::task::{ready, Poll}; -pub fn 
keyspace_total_size(keyspace: &CompactionKeySpace) -> u64 +pub fn keyspace_total_size( + keyspace: &CompactionKeySpace, + shard_identity: &ShardIdentity, +) -> u64 where K: CompactionKey, { - keyspace.iter().map(|r| K::key_range_size(r) as u64).sum() + keyspace + .iter() + .map(|r| K::key_range_size(r, shard_identity) as u64) + .sum() } pub fn overlaps_with(a: &Range, b: &Range) -> bool { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5dc62e506f..35519b5d0a 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -4,7 +4,7 @@ //! All the heavy lifting is done by the create_image and create_delta //! functions that the implementor provides. use futures::Future; -use pageserver_api::{key::Key, keyspace::key_range_size}; +use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; use std::ops::Range; use utils::lsn::Lsn; @@ -32,6 +32,8 @@ pub trait CompactionJobExecutor { // Functions that the planner uses to support its decisions // ---- + fn get_shard_identity(&self) -> &ShardIdentity; + /// Return all layers that overlap the given bounding box. fn get_layers( &mut self, @@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { /// /// This returns u32, for compatibility with Repository::key. If the /// distance is larger, return u32::MAX. 
- fn key_range_size(key_range: &Range) -> u32; + fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; // return "self + 1" fn next(&self) -> Self; @@ -113,8 +115,8 @@ impl CompactionKey for Key { const MIN: Self = Self::MIN; const MAX: Self = Self::MAX; - fn key_range_size(r: &std::ops::Range) -> u32 { - key_range_size(r) + fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { + ShardedRange::new(r.clone(), shard_identity).page_count() } fn next(&self) -> Key { (self as &Key).next() diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 6c00df3a65..3543df64fa 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -3,6 +3,7 @@ mod draw; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; use rand::Rng; use tracing::info; @@ -71,7 +72,7 @@ impl interface::CompactionKey for Key { const MIN: Self = u64::MIN; const MAX: Self = u64::MAX; - fn key_range_size(key_range: &Range) -> u32 { + fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 } @@ -434,6 +435,11 @@ impl interface::CompactionJobExecutor for MockTimeline { type ImageLayer = Arc; type RequestContext = MockRequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); + &IDENTITY + } + async fn get_layers( &mut self, key_range: &Range, diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..843f5dd862 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,14 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." 
} +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 0e77ef0563..9a556cb3d4 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -9,18 +9,45 @@ //! Coordinates in both axis are compressed for better readability. //! (see ) //! -//! Example use: +//! The plain text API was chosen so that we can easily work with filenames from various +//! sources; see the Usage section below for examples. +//! +//! # Usage +//! +//! ## Producing the SVG +//! //! ```bash -//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg -//! $ firefox out.svg +//! +//! # local timeline dir +//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! +//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` +//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg +//! +//! # From an `index_part.json` in S3 +//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg +//! //! ``` //! -//! This API was chosen so that we can easily work with filenames extracted from ssh, -//! or from pageserver log files. +//! ## Viewing //! -//! TODO Consider shipping this as a grafana panel plugin: -//! +//! **Inkscape** is better than the built-in viewers in browsers. +//! +//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) +//! 
to see the layer file name in the comment field. +//! +//! ```bash +//! +//! # Linux +//! inkscape out.svg +//! +//! # macOS +//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg +//! +//! ``` +//! + use anyhow::Result; use pageserver::repository::Key; use pageserver::METADATA_FILE_NAME; @@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range, Range) { pub fn main() -> Result<()> { // Parse layer filenames from stdin - let mut ranges: Vec<(Range, Range)> = vec![]; + struct Layer { + filename: String, + key_range: Range, + lsn_range: Range, + } + let mut files: Vec = vec![]; let stdin = io::stdin(); for line in stdin.lock().lines() { let line = line.unwrap(); @@ -76,14 +108,23 @@ pub fn main() -> Result<()> { // Don't try and parse "metadata" like a key-lsn range continue; } - let range = parse_filename(filename); - ranges.push(range); + let (key_range, lsn_range) = parse_filename(filename); + files.push(Layer { + filename: filename.to_owned(), + key_range, + lsn_range, + }); } // Collect all coordinates let mut keys: Vec = vec![]; let mut lsns: Vec = vec![]; - for (keyr, lsnr) in &ranges { + for Layer { + key_range: keyr, + lsn_range: lsnr, + .. 
+ } in &files + { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); @@ -107,7 +148,12 @@ pub fn main() -> Result<()> { h: stretch * lsn_map.len() as f32 } ); - for (keyr, lsnr) in &ranges { + for Layer { + filename, + key_range: keyr, + lsn_range: lsnr, + } in &files + { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; @@ -151,6 +197,7 @@ pub fn main() -> Result<()> { .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) + .comment(filename) ); } println!("{}", EndSvg); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e73d961e36..1fb75584fc 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,6 +9,11 @@ mod index_part; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +25,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,6 +56,7 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] @@ -68,6 +82,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. 
Validation is only very light, +so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back. + /// You can use a timestamp from a few seconds before invoking the command. Same format as `travel_to`. + done_if_after: Option, +} + +#[derive(Parser)] +struct AnalyzeLayerMapCmd { + /// Pageserver data path + path: Utf8PathBuf, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? 
+ } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } }; Ok(()) } @@ -185,3 +263,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophe that is only +/// one character change away (similar to "rm -r /home /username/foobar"). 
+fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + 
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index c3d8e61a2c..5043a207fc 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -312,8 +312,12 @@ async fn main_impl( let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, rel: rel_tag, blkno: block_no, } diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..a343acaf7a --- /dev/null +++ b/pageserver/src/aux_file.rs @@ -0,0 +1,112 @@ +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash]. 
+fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key = [0; METADATA_KEY_SIZE]; + let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[0..13]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path +/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. +/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. 
+pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_portable() { + // AUX file encoding requires the hash to be portable across all platforms. This test case checks + // if the algorithm produces the same hash across different environments. + assert_eq!( + 305317690835051308206966631765527126151, + twox_hash::xxh3::hash128("test1".as_bytes()) + ); + assert_eq!( + 85104974691013376326742244813280798847, + twox_hash::xxh3::hash128("test/test2".as_bytes()) + ); + assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); + } + + #[test] + fn test_encoding_portable() { + // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions + // of the page server. 
+ assert_eq!( + "6200000101E5B20C5F8DD5AA3289D6D9EAFA", + encode_aux_file_key("pg_logical/mappings/test1").to_string() + ); + assert_eq!( + "620000010239AAC544893139B26F501B97E6", + encode_aux_file_key("pg_logical/snapshots/test2").to_string() + ); + assert_eq!( + "620000010300000000000000000000000000", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + ); + assert_eq!( + "62000001FF8635AF2134B7266EC5B4189FD6", + encode_aux_file_key("pg_logical/unsupported").to_string() + ); + assert_eq!( + "6200000201772D0E5D71DE14DA86142A1619", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "620000FFFF1866EBEB53B807B26A2416F317", + encode_aux_file_key("other_file_not_supported").to_string() + ); + } +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0479d05f8f..58b18dae7d 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,7 +10,7 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{key_to_slru_block, Key}; @@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +#[derive(Debug, thiserror::Error)] +pub enum BasebackupError { + #[error("basebackup pageserver error {0:#}")] + Server(#[from] anyhow::Error), + #[error("basebackup client error {0:#}")] + Client(#[source] io::Error), +} + /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. 
/// @@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, ctx: &'a RequestContext, -) -> anyhow::Result<()> +) -> Result<(), BasebackupError> where W: AsyncWrite + Send + Sync + Unpin, { @@ -92,8 +100,10 @@ where // Consolidate the derived and the provided prev_lsn values let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn); + if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { + return Err(BasebackupError::Server(anyhow!( + "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" + ))); } provided_prev_lsn } else { @@ -159,15 +169,26 @@ where } } - async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> { + async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { let (kind, segno, _) = key_to_slru_block(*key)?; match kind { SlruKind::Clog => { - ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8); + if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) { + return Err(BasebackupError::Server(anyhow!( + "invalid SlruKind::Clog record: block.len()={}", + block.len() + ))); + } } SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { - ensure!(block.len() == BLCKSZ as usize); + if block.len() != BLCKSZ as usize { + return Err(BasebackupError::Server(anyhow!( + "invalid {:?} record: block.len()={}", + kind, + block.len() + ))); + } } } @@ -194,12 +215,15 @@ where Ok(()) } - async fn flush(&mut self) -> anyhow::Result<()> { + async fn flush(&mut self) -> Result<(), BasebackupError> { let nblocks = self.buf.len() / BLCKSZ as usize; let (kind, segno) = self.current_segment.take().unwrap(); let segname = format!("{}/{:>04X}", kind.to_str(), segno); let header = new_tar_header(&segname, self.buf.len() as u64)?; - self.ar.append(&header, self.buf.as_slice()).await?; + self.ar + .append(&header, self.buf.as_slice()) + .await + 
.map_err(BasebackupError::Client)?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -209,7 +233,7 @@ where Ok(()) } - async fn finish(mut self) -> anyhow::Result<()> { + async fn finish(mut self) -> Result<(), BasebackupError> { let res = if self.current_segment.is_none() || self.buf.is_empty() { Ok(()) } else { @@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, { - async fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; @@ -262,16 +286,25 @@ where let slru_partitions = self .timeline .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) - .await? - .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64); + .await + .map_err(|e| BasebackupError::Server(e.into()))? + .partition( + self.timeline.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { - let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?; + let blocks = self + .timeline + .get_vectored(part, self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for (key, block) in blocks { - slru_builder.add_block(&key, block?).await?; + let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + slru_builder.add_block(&key, block).await?; } } slru_builder.finish().await?; @@ -279,8 +312,11 @@ where let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in - self.timeline.list_dbdirs(self.lsn, self.ctx).await? + for ((spcnode, dbnode), has_relmap_file) in self + .timeline + .list_dbdirs(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
{ self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -289,7 +325,8 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -312,7 +349,12 @@ where } } - for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? { + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( @@ -343,34 +385,41 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_twophase_file(xid).await?; } fail_point!("basebackup-before-control-file", |_| { - bail!("failpoint basebackup-before-control-file") + Err(BasebackupError::Server(anyhow!( + "failpoint basebackup-before-control-file" + ))) }); // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await?; + self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. 
- async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; return Ok(()); } @@ -384,14 +433,18 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; - self.ar.append(&header, segment_data.as_slice()).await?; + self.ar + .append(&header, segment_data.as_slice()) + .await + .map_err(BasebackupError::Client)?; seg += 1; startblk = endblk; @@ -411,20 +464,22 @@ where spcnode: u32, dbnode: u32, has_relmap_file: bool, - ) -> anyhow::Result<()> { + ) -> Result<(), BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; - ensure!( - img.len() - == dispatch_pgversion!( - self.timeline.pg_version, - pgv::bindings::SIZEOF_RELMAPFILE - ) - ); + if img.len() + != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) + { + return Err(BasebackupError::Server(anyhow!( + "img.len() != 
SIZE_OF_RELMAPFILE, img.len()={}", + img.len(), + ))); + } Some(img) } else { @@ -437,14 +492,20 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } else { warn!("global/pg_filenode.map is missing"); } @@ -463,18 +524,26 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? .is_empty() { return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == DEFAULTTABLESPACE_OID); + if spcnode != DEFAULTTABLESPACE_OID { + return Err(BasebackupError::Server(anyhow!( + "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" + ))); + } // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -484,11 +553,17 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - 
self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } }; Ok(()) @@ -497,11 +572,12 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -509,7 +585,10 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..]).await?; + self.ar + .append(&header, &buf[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } @@ -518,24 +597,28 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { if self.lsn == self.timeline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + write!(zenith_signal, "PREV LSN: none") + .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: invalid") + .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + .map_err(|e| BasebackupError::Server(e.into()))?; } self.ar .append( &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, zenith_signal.as_bytes(), ) - .await?; + .await + .map_err(BasebackupError::Client)?; let checkpoint_bytes = self .timeline @@ -557,7 +640,10 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..]).await?; + self.ar + .append(&header, &pg_control_bytes[..]) + .await + .map_err(BasebackupError::Client)?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -572,8 +658,16 @@ where self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..]).await?; + if wal_seg.len() != WAL_SEGMENT_SIZE { + return Err(BasebackupError::Server(anyhow!( + "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", + wal_seg.len() + ))); + } + self.ar + .append(&header, &wal_seg[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 0903b206ff..1345223a43 100644 --- a/pageserver/src/bin/pageserver.rs +++ 
b/pageserver/src/bin/pageserver.rs @@ -121,8 +121,10 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); - // after setting up logging, log the effective IO engine choice + // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.get_impl, "starting with get page implementation"); + info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -285,6 +287,7 @@ fn start_pageserver( )) .unwrap(); pageserver::preinitialize_metrics(); + pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1837da34ce..258eed0b12 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; -use std::{collections::HashMap, env}; +use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; @@ -30,9 +30,9 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -51,7 +51,7 @@ pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; - pub use pageserver_api::{ + pub use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; 
@@ -91,12 +91,16 @@ pub mod defaults { pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_GET_IMPL: &str = "legacy"; + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync"; + /// /// Default built-in configuration file. /// @@ -136,10 +140,14 @@ pub mod defaults { #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' +#get_impl = '{DEFAULT_GET_IMPL}' + #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' +#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -280,6 +288,8 @@ pub struct PageServerConf { pub get_vectored_impl: GetVectoredImpl, + pub get_impl: GetImpl, + pub max_vectored_read_bytes: MaxVectoredReadBytes, pub validate_vectored_get: bool, @@ -290,6 +300,8 @@ pub struct PageServerConf { /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, + + pub walredo_process_kind: crate::walredo::ProcessKind, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -323,26 +335,6 @@ impl BuilderValue { } } -// Certain metadata (e.g. externally-addressable name, AZ) is delivered -// as a separate structure. This information is not neeed by the pageserver -// itself, it is only used for registering the pageserver with the control -// plane and/or storage controller. 
-// -#[derive(serde::Deserialize)] -pub(crate) struct NodeMetadata { - #[serde(rename = "host")] - pub(crate) postgres_host: String, - #[serde(rename = "port")] - pub(crate) postgres_port: u16, - pub(crate) http_host: String, - pub(crate) http_port: u16, - - // Deployment tools may write fields to the metadata file beyond what we - // use in this type: this type intentionally only names fields that require. - #[serde(flatten)] - pub(crate) other: HashMap, -} - // needed to simplify config construction #[derive(Default)] struct PageServerConfigBuilder { @@ -408,11 +400,15 @@ struct PageServerConfigBuilder { get_vectored_impl: BuilderValue, + get_impl: BuilderValue, + max_vectored_read_bytes: BuilderValue, validate_vectored_get: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, + + walredo_process_kind: BuilderValue, } impl PageServerConfigBuilder { @@ -495,11 +491,14 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + + walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()), } } } @@ -671,6 +670,10 @@ impl PageServerConfigBuilder { self.get_vectored_impl = BuilderValue::Set(value); } + pub fn get_impl(&mut self, value: GetImpl) { + self.get_impl = BuilderValue::Set(value); + } + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } @@ -683,6 +686,10 @@ impl PageServerConfigBuilder { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) { + 
self.walredo_process_kind = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -736,9 +743,11 @@ impl PageServerConfigBuilder { secondary_download_concurrency, ingest_batch_size, get_vectored_impl, + get_impl, max_vectored_read_bytes, validate_vectored_get, ephemeral_bytes_per_memory_kb, + walredo_process_kind, } CUSTOM LOGIC { @@ -1020,6 +1029,9 @@ impl PageServerConf { "get_vectored_impl" => { builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) } + "get_impl" => { + builder.get_impl(parse_toml_from_str("get_impl", item)?) + } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; builder.get_max_vectored_read_bytes( @@ -1032,6 +1044,9 @@ impl PageServerConf { "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "walredo_process_kind" => { + builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1108,12 +1123,14 @@ impl PageServerConf { ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), } } } @@ -1346,12 +1363,14 @@ background_task_maximum_delay = '334 s' ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Correct defaults should be used when no config values are provided" ); @@ -1418,12 +1437,14 @@ background_task_maximum_delay = '334 s' ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) 
.expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(), }, "Should be able to parse all basic config values correctly" ); @@ -1536,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}' endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f5540e896f..62bbde42f4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker( continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. 
continue; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 6740c1360b..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { - if state != TenantState::Active || !id.is_zero() { + if state != TenantState::Active || !id.is_shard_zero() { None } else { tenant_manager diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f0ed46ce23..26e7cc7ef8 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -14,10 +14,8 @@ use tokio_util::sync::CancellationToken; use url::Url; use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; -use crate::{ - config::{NodeMetadata, PageServerConf}, - virtual_file::on_fatal_io_error, -}; +use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; +use pageserver_api::config::NodeMetadata; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) @@ -65,7 +63,7 @@ impl ControlPlaneClient { let mut client = reqwest::ClientBuilder::new(); if let Some(jwt) = &conf.control_plane_api_token { - let mut headers = hyper::HeaderMap::new(); + let mut headers = reqwest::header::HeaderMap::new(); headers.insert( "Authorization", format!("Bearer {}", jwt.get_contents()).parse().unwrap(), diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 2713309824..c425f3e628 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": description: The reload completed successfully. 
- "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,62 +75,14 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 404 means that deletion successfully finished" responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Tenant not found + description: Tenant not found. This is the success path. 
content: application/json: schema: @@ -165,18 +99,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/time_travel_remote_storage: parameters: @@ -206,36 +128,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline: parameters: @@ -255,36 +147,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -309,60 +171,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. content: application/json: schema: @@ -379,18 +193,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: @@ -423,36 +225,6 @@ paths: schema: type: string format: date-time - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found, or there is no timestamp information for the given lsn - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -484,36 +256,6 @@ paths: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -537,36 +279,6 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/location_config: parameters: - name: tenant_shard_id @@ -628,24 +340,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantLocationConfigResponse" - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: | The tenant is already known to Pageserver in some way, @@ -662,12 +356,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -684,36 +372,6 @@ paths: responses: "200": description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/load: @@ -740,36 +398,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: @@ -790,37 +418,6 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "404": - description: No tenant or timeline found for the specified ids - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/{tenant_id}/synthetic_size: parameters: @@ -839,31 +436,8 @@ paths: application/json: schema: $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + # This route has no handler. TODO: remove? /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -945,18 +519,6 @@ paths: responses: "200": description: Success - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_shard_id}/secondary/download: parameters: @@ -987,20 +549,6 @@ paths: application/json: schema: $ref: "#/components/schemas/SecondaryProgress" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1043,24 +591,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. content: @@ -1079,18 +609,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/: get: @@ -1104,30 +622,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1148,43 +642,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1206,36 +669,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1255,42 +688,6 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/utilization: get: @@ -1304,12 +701,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PageserverUtilization" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" components: securitySchemes: @@ -1391,9 +782,6 @@ components: required: - mode properties: - tenant_id: - type: string - description: Not used, scheduled for removal. 
mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 47d8ae1148..cf526940f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -19,6 +19,8 @@ use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantScanRemoteStorageResponse; +use pageserver_api::models::TenantScanRemoteStorageShard; use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; @@ -29,6 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; +use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; @@ -54,6 +57,9 @@ use crate::tenant::mgr::{ }; use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; use crate::tenant::remote_timeline_client; +use crate::tenant::remote_timeline_client::download_index_part; +use crate::tenant::remote_timeline_client::list_remote_tenant_shards; +use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; @@ -160,6 +166,9 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), + PageReconstructError::MissingKey(e) => { + ApiError::InternalServerError(anyhow::anyhow!("{e}")) + } PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } @@ -457,8 +466,12 @@ async 
fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -696,7 +709,7 @@ async fn get_lsn_by_timestamp_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -747,7 +760,7 @@ async fn get_timestamp_of_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -772,7 +785,9 @@ async fn get_timestamp_of_lsn_handler( let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } @@ -1086,7 +1101,7 @@ async fn tenant_size_handler( let headers = request.headers(); let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); @@ -1903,12 +1918,14 @@ async fn timeline_collect_keyspace( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, 
timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let keys = timeline + let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. + // Therefore, we split dense/sparse keys in this API. + let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } @@ -2026,6 +2043,79 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn tenant_scan_remote_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + + let Some(remote_storage) = state.remote_storage.as_ref() else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Remote storage not configured" + ))); + }; + + let mut response = TenantScanRemoteStorageResponse::default(); + + let (shards, _other_keys) = + list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + for tenant_shard_id in shards { + let (timeline_ids, _other_keys) = + list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + let mut generation = Generation::none(); + for timeline_id in timeline_ids { + match download_index_part( + remote_storage, + &tenant_shard_id, + &timeline_id, + Generation::MAX, + &cancel, + ) + .instrument(info_span!("download_index_part", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id)) + .await + { + Ok((index_part, index_generation)) => { + 
tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn()); + generation = std::cmp::max(generation, index_generation); + } + Err(DownloadError::NotFound) => { + // This is normal for tenants that were created with multiple shards: they have an unsharded path + // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. + tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + continue; + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + }; + } + + response.shards.push(TenantScanRemoteStorageShard { + tenant_shard_id, + generation: generation.into(), + }); + } + + if response.shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), + )); + } + + json_response(StatusCode::OK, response) +} + async fn secondary_download_handler( request: Request, _cancel: CancellationToken, @@ -2070,6 +2160,27 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn secondary_status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(StatusCode::OK, progress) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -2422,12 +2533,18 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { 
api_handler(r, secondary_upload_handler) }) + .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { + api_handler(r, tenant_scan_remote_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { + api_handler(r, secondary_status_handler) + }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f947a75f61..930700e50c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub use pageserver_api::keyspace; +pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3160f204e2..903bad34cc 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -51,6 +51,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, + #[strum(serialize = "create tenant")] CreateTenant, } @@ -86,41 +89,58 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy 
= Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); // Metrics collected on operations on the storage repository. +#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } @@ -133,13 +153,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} + +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + 
GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { @@ -154,6 +194,11 @@ pub(crate) struct GetVectoredLatency { map: EnumMap>, } +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + impl GetVectoredLatency { // Only these task types perform vectored gets. Filter all other tasks out to reduce total // cardinality of the metric. @@ -164,6 +209,48 @@ impl GetVectoredLatency { } } +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. 
+ const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", @@ -187,6 +274,29 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| } }); +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) 
struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -1482,35 +1592,6 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); -pub(crate) struct WalIngestMetrics { - pub(crate) bytes_received: IntCounter, - pub(crate) records_received: IntCounter, - pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, -} - -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { - bytes_received: register_int_counter!( - "pageserver_wal_ingest_bytes_received", - "Bytes of WAL ingested from safekeepers", - ) - .unwrap(), - records_received: register_int_counter!( - "pageserver_wal_ingest_records_received", - "Number of WAL records received from safekeepers" - ) - .expect("failed to define a metric"), - records_committed: register_int_counter!( - "pageserver_wal_ingest_records_committed", - "Number of WAL records which resulted in writes to pageserver storage" - ) - .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), -}); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, @@ -1518,7 +1599,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1536,7 +1618,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - 
"Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( @@ -1544,6 +1626,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1710,6 +1793,43 @@ macro_rules! redo_bytes_histogram_count_buckets { }; } +pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, + pub(crate) records_received: IntCounter, + pub(crate) records_committed: IntCounter, + pub(crate) records_filtered: IntCounter, + pub(crate) time_spent_on_ingest: Histogram, +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), + records_received: register_int_counter!( + "pageserver_wal_ingest_records_received", + "Number of WAL records received from safekeepers" + ) + .expect("failed to define a metric"), + records_committed: register_int_counter!( + "pageserver_wal_ingest_records_committed", + "Number of WAL records which resulted in writes to pageserver storage" + ) + .expect("failed to define a metric"), + records_filtered: register_int_counter!( + "pageserver_wal_ingest_records_filtered", + "Number of WAL records filtered out due to sharding" + ) + .expect("failed to define a metric"), + time_spent_on_ingest: register_histogram!( + "pageserver_wal_ingest_put_value_seconds", + "Actual time spent on ingesting a record", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric"), +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", @@ -1819,6 +1939,29 @@ impl Default for WalRedoProcessCounters { 
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); +#[cfg(not(test))] +pub mod wal_redo { + use super::*; + + static PROCESS_KIND: Lazy> = Lazy::new(|| { + std::sync::Mutex::new( + register_uint_gauge_vec!( + "pageserver_wal_redo_process_kind", + "The configured process kind for walredo", + &["kind"], + ) + .unwrap(), + ) + }); + + pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) { + // use guard to avoid races around the next two steps + let guard = PROCESS_KIND.lock().unwrap(); + guard.reset(); + guard.with_label_values(&[&format!("{kind}")]).set(1); + } +} + /// Similar to `prometheus::HistogramTimer` but does not record on drop. pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, @@ -1840,6 +1983,22 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(duration); } + + /// Turns this timer into a timer, which will always record -- usually this means recording + /// regardless an early `?` path was taken in a function. 
+ pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { + AlwaysRecordingStorageTimeMetricsTimer(Some(self)) + } +} + +pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); + +impl Drop for AlwaysRecordingStorageTimeMetricsTimer { + fn drop(&mut self) { + if let Some(inner) = self.0.take() { + inner.stop_and_record(); + } + } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and @@ -1900,6 +2059,7 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -1960,6 +2120,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, + &tenant_id, + &shard_id, + &timeline_id, + ); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2002,6 +2168,7 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, resident_physical_size_gauge, @@ -2089,7 +2256,7 @@ impl TimelineMetrics { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // Only shard zero deals in synthetic sizes - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } @@ -2746,7 +2913,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/page_service.rs 
b/pageserver/src/page_service.rs index 5956b3c322..748c4b18e3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,13 +1,5 @@ -// //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -// -// It is possible to connect here using usual psql/pgbench/libpq. Following -// commands are supported now: -// *status* -- show actual info about this pageserver, -// *pagestream* -- enter mode where smgr and pageserver talk with their -// custom protocol. -// use anyhow::Context; use async_compression::tokio::write::GzipEncoder; @@ -23,7 +15,7 @@ use pageserver_api::models::{ PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, - PagestreamNblocksResponse, + PagestreamNblocksResponse, PagestreamProtocolVersion, }; use pageserver_api::shard::ShardIndex; use pageserver_api::shard::ShardNumber; @@ -56,6 +48,7 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; +use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; @@ -551,6 +544,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -613,14 +607,15 @@ impl PageServerHandler { t.trace(©_data_bytes) } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; // TODO: We could create a new per-request context here, with unique ID. 
// Currently we use the same per-timeline context for all requests let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -629,7 +624,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -639,7 +634,7 @@ impl PageServerHandler { } PagestreamFeMessage::GetPage(req) => { // shard_id is filled in by the handler - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -648,7 +643,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -657,7 +652,7 @@ impl PageServerHandler { ) } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn); + let span = 
tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); ( self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) @@ -838,78 +833,80 @@ impl PageServerHandler { /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about - /// which version of the page is being requested. The client can request the - /// latest version of the page, or the version that's valid at a particular - /// LSN. The primary compute node will always request the latest page - /// version, while a standby will request a version at the LSN that it's - /// currently caught up to. + /// which version of the page is being requested. The primary compute node + /// will always request the latest page version, by setting 'request_lsn' to + /// the last inserted or flushed WAL position, while a standby will request + /// a version at the LSN that it's currently caught up to. /// /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. + /// + /// In addition to the request LSN, each request carries another LSN, + /// 'not_modified_since', which is a hint to the pageserver that the client + /// knows that the page has not been modified between 'not_modified_since' + /// and the request LSN. This allows skipping the wait, as long as the WAL + /// up to 'not_modified_since' has arrived. If the client doesn't have any + /// information about when the page was modified, it will use + /// not_modified_since == lsn. If the client lies and sends a too low + /// not_modified_hint such that there are in fact later page versions, the + /// behavior is undefined: the pageserver may return any of the page versions + /// or an error. 
async fn wait_or_get_last_lsn( timeline: &Timeline, - mut lsn: Lsn, - latest: bool, + request_lsn: Lsn, + not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { - if latest { - // Latest page version was requested. If LSN is given, it is a hint - // to the page server that there have been no modifications to the - // page after that LSN. If we haven't received WAL up to that point, - // wait until it arrives. - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); - // Note: this covers the special case that lsn == Lsn(0). That - // special case means "return the latest version whatever it is", - // and it's used for bootstrapping purposes, when the page server is - // connected directly to the compute node. That is needed because - // when you connect to the compute node, to receive the WAL, the - // walsender process will do a look up in the pg_authid catalog - // table for authentication. That poses a deadlock problem: the - // catalog table lookup will send a GetPage request, but the GetPage - // request will block in the page server because the recent WAL - // hasn't been received yet, and it cannot be received until the - // walsender completes the authentication and starts streaming the - // WAL. 
- if lsn <= last_record_lsn { - lsn = last_record_lsn; + // Sanity check the request + if request_lsn < not_modified_since { + return Err(PageStreamError::BadRequest( + format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ) + .into(), + )); + } + + if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) } else { - timeline - .wait_lsn( - lsn, - crate::tenant::timeline::WaitLsnWaiter::PageService, - ctx, - ) - .await?; - // Since we waited for 'lsn' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the - // last-record LSN can advance immediately after we return - // anyway) - } - } else { - if lsn == Lsn(0) { - return Err(PageStreamError::BadRequest( - "invalid LSN(0) in request".into(), - )); - } + PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ).into()) + }); + } + + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { timeline .wait_lsn( - lsn, + not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, ctx, ) .await?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) + } else { + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. 
Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) } - - if lsn < **latest_gc_cutoff_lsn { - return Err(PageStreamError::BadRequest(format!( - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ).into())); - } - Ok(lsn) } #[instrument(skip_all, fields(shard_id))] @@ -926,12 +923,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -954,12 +956,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_size(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -982,18 +989,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - 
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let total_blocks = timeline - .get_db_size( - DEFAULTTABLESPACE_OID, - req.dbnode, - Version::Lsn(lsn), - req.latest, - ctx, - ) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -1160,12 +1166,17 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -1188,9 +1199,14 @@ impl PageServerHandler { .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let kind = SlruKind::from_repr(req.kind) .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; @@ -1201,6 +1217,10 @@ impl PageServerHandler { )) } + /// Note on "fullbackup": + /// Full basebackups should only be used for debugging purposes. + /// Originally, it was introduced to enable breaking storage format changes, + /// but that is not applicable anymore. 
#[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( @@ -1217,6 +1237,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { + fn map_basebackup_error(err: BasebackupError) -> QueryError { + match err { + BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + BasebackupError::Server(e) => QueryError::Other(e), + } + } + let started = std::time::Instant::now(); // check that the timeline exists @@ -1242,7 +1269,8 @@ impl PageServerHandler { let lsn_awaited_after = started.elapsed(); // switch client to COPYOUT - pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not @@ -1257,7 +1285,8 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } else { let mut writer = pgb.copyout_writer(); if gzip { @@ -1278,9 +1307,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; // shutdown the encoder to ensure the gzip footer is written - encoder.shutdown().await?; + encoder + .shutdown() + .await + .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -1290,11 +1323,13 @@ impl PageServerHandler { full_backup, ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } } - pgb.write_message_noflush(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; let basebackup_after = started @@ -1403,7 +1438,34 @@ where let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if 
query_string.starts_with("pagestream ") { + if query_string.starts_with("pagestream_v2 ") { + let (_, params_raw) = query_string.split_at("pagestream_v2 ".len()); + let params = params_raw.split(' ').collect::>(); + if params.len() != 2 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_id))?; + + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; + } else if query_string.starts_with("pagestream ") { let (_, params_raw) = query_string.split_at("pagestream ".len()); let params = params_raw.split(' ').collect::>(); if params.len() != 2 { @@ -1422,8 +1484,14 @@ where self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) - .await?; + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V1, + ctx, + ) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6f7d74bdee..12314c5961 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,6 +9,7 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::metrics::WAL_INGEST; use crate::repository::*; use 
crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; @@ -22,6 +23,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; +use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -175,7 +177,6 @@ impl Timeline { tag: RelTag, blknum: BlockNumber, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -184,7 +185,7 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -206,7 +207,6 @@ impl Timeline { spcnode: Oid, dbnode: Oid, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; @@ -214,7 +214,7 @@ impl Timeline { let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -225,7 +225,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -239,7 +238,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, version, latest, ctx).await? + && !self.get_rel_exists(tag, version, ctx).await? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -252,16 +251,8 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - if latest { - // Update relation size cache only if "latest" flag is set. - // This flag is set by compute when it is working with most recent version of relation. - // Typically master compute node always set latest=true. - // Please notice, that even if compute node "by mistake" specifies old LSN but set - // latest=true, then it can not cause cache corruption, because with latest=true - // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be - // associated with most recent value of LSN. - self.update_cached_rel_size(tag, version.get_lsn(), nblocks); - } + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + Ok(nblocks) } @@ -270,7 +261,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - _latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -289,7 +279,7 @@ impl Timeline { match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -389,7 +379,7 @@ impl Timeline { match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.segments.get(&segno).is_some(); + let exists = dir.segments.contains(&segno); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -466,6 +456,12 @@ impl Timeline { // Didn't find any commit timestamps smaller than the request Ok(LsnForTimestamp::Past(min_lsn)) } + (true, _) if commit_lsn < min_lsn => { + // the search above did set found_smaller to true but it never increased the lsn. 
+ // Then, low is still the old min_lsn, and the subtraction above gave a value + // below the min_lsn. We should never do that. + Ok(LsnForTimestamp::Past(min_lsn)) + } (true, false) => { // Only found commits with timestamps smaller than the request. // It's still a valid case for branch creation, return it. @@ -735,11 +731,13 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). + /// + /// The return value is (dense keyspace, sparse keyspace). pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result { + ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -811,13 +809,18 @@ impl Timeline { if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY); } - Ok(result.to_keyspace()) + + Ok(( + result.to_keyspace(), + /* AUX sparse key space */ + SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())), + )) } /// Get cached size of relation if it not updated after specified LSN pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { if lsn >= *cached_lsn { return Some(*nblocks); } @@ -828,7 +831,16 @@ impl Timeline { /// Update cached relation size if there is no more recent update pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { + + if lsn < rel_size_cache.complete_as_of { + // Do not cache old values. 
It's safe to cache the size on read, as long as + // the read was at an LSN since we started the WAL ingestion. Reasoning: we + // never evict values from the cache, so if the relation size changed after + // 'lsn', the new value is already in the cache. + return; + } + + match rel_size_cache.map.entry(tag) { hash_map::Entry::Occupied(mut entry) => { let cached_lsn = entry.get_mut(); if lsn >= cached_lsn.0 { @@ -844,13 +856,13 @@ impl Timeline { /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); + rel_size_cache.map.insert(tag, (lsn, nblocks)); } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); + rel_size_cache.map.remove(tag); } } @@ -1088,7 +1100,7 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { let total_blocks = self .tline - .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir @@ -1131,21 +1143,22 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // Didn't exist. Update dbdir - dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); - self.put(DBDIR_KEY, Value::Image(buf.into())); + let mut rel_dir = + if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { + // Didn't exist. 
Update dbdir + e.insert(false); + let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.put(DBDIR_KEY, Value::Image(buf.into())); - // and create the RelDirectory - RelDirectory::default() - } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? - }; + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { @@ -1187,7 +1200,7 @@ impl<'a> DatadirModification<'a> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline - .get_rel_exists(rel, Version::Modified(self), true, ctx) + .get_rel_exists(rel, Version::Modified(self), ctx) .await? { let size_key = rel_size_to_key(rel); @@ -1401,7 +1414,7 @@ impl<'a> DatadirModification<'a> { let n_files; let mut aux_files = self.tline.aux_files.lock().await; if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value + // We already updated aux files in `self`: emit a delta and update our latest value. dir.upsert(file_path.clone(), content.clone()); n_files = dir.files.len(); if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { @@ -1446,10 +1459,14 @@ impl<'a> DatadirModification<'a> { // reset the map. return Err(e.into()); } - // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so - // we are assuming that all _other_ possible errors represents a missing key. If some - // other error occurs, we may incorrectly reset the map of aux files. 
- Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => { + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { // Key is missing, we must insert an image as the basis for subsequent deltas. let mut dir = AuxFilesDirectory { @@ -1541,6 +1558,8 @@ impl<'a> DatadirModification<'a> { pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = self.tline.writer().await; + let timer = WAL_INGEST.time_spent_on_ingest.start_timer(); + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1580,6 +1599,8 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } + timer.observe_duration(); + Ok(()) } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 9959d105eb..0a9ac50aad 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,6 +33,52 @@ impl Value { } } +#[cfg(test)] +#[derive(Debug, PartialEq)] +pub(crate) enum InvalidInput { + TooShortValue, + TooShortPostgresRecord, +} + +/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets +/// use this type for querying if a slice looks some particular way. 
+#[cfg(test)] +pub(crate) struct ValueBytes; + +#[cfg(test)] +impl ValueBytes { + pub(crate) fn will_init(raw: &[u8]) -> Result { + if raw.len() < 12 { + return Err(InvalidInput::TooShortValue); + } + + let value_discriminator = &raw[0..4]; + + if value_discriminator == [0, 0, 0, 0] { + // Value::Image always initializes + return Ok(true); + } + + if value_discriminator != [0, 0, 0, 1] { + // not a Value::WalRecord(..) + return Ok(false); + } + + let walrecord_discriminator = &raw[4..8]; + + if walrecord_discriminator != [0, 0, 0, 0] { + // only NeonWalRecord::Postgres can have will_init + return Ok(false); + } + + if raw.len() < 17 { + return Err(InvalidInput::TooShortPostgresRecord); + } + + Ok(raw[8] == 1) + } +} + #[cfg(test)] mod test { use super::*; @@ -70,6 +116,8 @@ mod test { ]; roundtrip!(image, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] @@ -93,6 +141,96 @@ mod test { ]; roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + } + + #[test] + fn bytes_inspection_too_short_image() { + let rec = Value::Image(Bytes::from_static(b"")); + + #[rustfmt::skip] + let expected = [ + // top level discriminator of 4 bytes + 0x00, 0x00, 0x00, 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 12); + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ); + } + } + + #[test] + fn bytes_inspection_too_short_postgres_record() { + let rec = NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b""), + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // flattened discriminator of total 8 bytes + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + // will_init + 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + 
roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 17); + for len in 12..17 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortPostgresRecord + ) + } + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ) + } + } + + #[test] + fn clear_visibility_map_flags_example() { + let rec = NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: Some(0x11), + old_heap_blkno: None, + flags: 0x03, + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // discriminators + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, + // Some == 1 followed by 4 bytes + 0x01, 0x00, 0x00, 0x00, 0x11, + // None == 0 + 0x00, + // flags + 0x03 + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 9a1e354ecf..0c245580ee 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -361,6 +361,10 @@ pub enum TaskKind { DebugTool, + EphemeralFilePreWarmPageCache, + + LayerDownload, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 17ff033e00..fdc49ae295 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -64,6 +64,7 @@ use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; +use self::timeline::{GcCutoffs, GcInfo}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -86,7 +87,6 @@ use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use std::cmp::min; use 
std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -386,7 +386,7 @@ impl WalRedoManager { pub(crate) fn status(&self) -> Option { match self { - WalRedoManager::Prod(m) => m.status(), + WalRedoManager::Prod(m) => Some(m.status()), #[cfg(test)] WalRedoManager::Test(_) => None, } @@ -559,9 +559,10 @@ impl Tenant { // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. + // FIXME: this branch should be dead code as we no longer write local metadata. let rtc = timeline.remote_client.as_ref().unwrap(); rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_metadata_update(&metadata)?; + rtc.schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -887,7 +888,7 @@ impl Tenant { #[instrument(skip_all)] pub(crate) async fn preload( - self: &Arc, + self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { @@ -917,9 +918,13 @@ impl Tenant { Ok(TenantPreload { deleting, - timelines: self - .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: Self::load_timeline_metadata( + self, + remote_timeline_ids, + remote_storage, + cancel, + ) + .await?, }) } @@ -2807,7 +2812,48 @@ impl Tenant { cancel: &CancellationToken, ctx: &RequestContext, ) -> anyhow::Result>> { - // grab mutex to prevent new timelines from being created here. + // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for + // currently visible timelines. 
+ let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| match target_timeline_id.as_ref() { + Some(target) => &tl.timeline_id == target, + None => true, + }) + .cloned() + .collect::>(); + + let mut gc_cutoffs: HashMap = + HashMap::with_capacity(timelines.len()); + + for timeline in timelines.iter() { + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await; + + match res { + Ok(cutoffs) => { + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); + } + Err(e) => { + tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}"); + } + } + } + + if !self.is_active() { + anyhow::bail!("shutting down"); + } + + // grab mutex to prevent new timelines from being created here; avoid doing long operations + // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; // Scan all timelines. 
For each timeline, remember the timeline ID and @@ -2869,20 +2915,36 @@ impl Tenant { } } - if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let branchpoints: Vec = all_branchpoints + .range(( + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); - gc_timelines.push(timeline); + { + let mut target = timeline.gc_info.write().unwrap(); + + match gc_cutoffs.remove(&timeline_id) { + Some(cutoffs) => { + *target = GcInfo { + retain_lsns: branchpoints, + cutoffs, + }; + } + None => { + // reasons for this being unavailable: + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + // + // in both cases, refreshing the branchpoints is correct. 
+ target.retain_lsns = branchpoints; + } + }; } + + gc_timelines.push(timeline); } drop(gc_cs); Ok(gc_timelines) @@ -2969,7 +3031,7 @@ impl Tenant { // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + let cutoff = gc_info.min_cutoff(); if start_lsn < cutoff { return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" @@ -3027,7 +3089,7 @@ impl Tenant { // See also https://github.com/neondatabase/neon/issues/3865 if let Some(remote_client) = new_timeline.remote_client.as_ref() { remote_client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .context("branch initial metadata upload")?; } @@ -3190,7 +3252,7 @@ impl Tenant { run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if self.tenant_shard_id().is_zero() { + if self.tenant_shard_id().is_shard_zero() { self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) .await?; } @@ -3398,7 +3460,11 @@ impl Tenant { // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. - let mut shared_cache = self.cached_logical_sizes.lock().await; + let mut shared_cache = tokio::select! 
{ + locked = self.cached_logical_sizes.lock() => locked, + _ = cancel.cancelled() => anyhow::bail!("cancelled"), + _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"), + }; size::gather_inputs( self, @@ -3437,7 +3503,7 @@ impl Tenant { .store(size, Ordering::Relaxed); // Only shard zero should be calculating synthetic sizes - debug_assert!(self.shard_identity.is_zero()); + debug_assert!(self.shard_identity.is_shard_zero()); TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) @@ -3660,6 +3726,7 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2), } } } @@ -3848,6 +3915,8 @@ pub(crate) mod harness { #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; @@ -3856,9 +3925,12 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; + use pageserver_api::models::CompactionAlgorithm; use rand::{thread_rng, Rng}; - use tests::timeline::ShutdownMode; + use tests::storage_layer::ValuesReconstructState; + use tests::timeline::{GetVectoredError, ShutdownMode}; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4495,11 +4567,25 @@ mod tests { } async fn bulk_insert_compact_gc( - timeline: Arc, + tenant: &Tenant, + timeline: &Arc, + ctx: &RequestContext, + lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let compact = true; + bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await + } + + async fn bulk_insert_maybe_compact_gc( + tenant: &Tenant, + timeline: &Arc, ctx: &RequestContext, mut lsn: Lsn, repeat: usize, key_count: 
usize, + compact: bool, ) -> anyhow::Result<()> { let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let mut blknum = 0; @@ -4507,6 +4593,8 @@ mod tests { // Enforce that key range is monotonously increasing let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + for _ in 0..repeat { for _ in 0..key_count { test_key.field6 = blknum; @@ -4528,22 +4616,19 @@ mod tests { blknum += 1; } - let cutoff = timeline.get_last_record_lsn(); - - timeline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - ctx, - ) - .await?; timeline.freeze_and_flush().await?; - timeline - .compact(&CancellationToken::new(), EnumSet::empty(), ctx) + if compact { + // this requires timeline to be &Arc + timeline.compact(&cancel, EnumSet::empty(), ctx).await?; + } + + // this doesn't really need to use the timeline_id target, but it is closer to what it + // originally was. + let res = tenant + .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) .await?; - timeline.gc().await?; + + assert_eq!(res.layers_removed, 0, "this never removes anything"); } Ok(()) @@ -4562,7 +4647,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; Ok(()) } @@ -4593,7 +4678,7 @@ mod tests { .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; let guard = tline.layers.read().await; guard.layer_map().dump(true, &ctx).await?; @@ -4646,7 +4731,9 @@ mod tests { for read in reads { info!("Doing vectored read on {:?}", read); - let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await; + let vectored_res = tline + .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx) + .await; tline 
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) .await; @@ -4655,6 +4742,59 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files")?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl( + aux_keyspace.clone(), + read_lsn, + ValuesReconstructState::new(), + &ctx, + ) + .await; + + child_timeline + .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) + .await; + + let images = vectored_res?; + assert!(images.is_empty()); + Ok(()) + } + // Test that vectored get handles layer gaps correctly // by advancing into the next ancestor timeline if required. 
// @@ -4783,7 +4923,12 @@ mod tests { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; let results = child_timeline - .get_vectored_impl(read.clone(), current_lsn, &ctx) + .get_vectored_impl( + read.clone(), + current_lsn, + ValuesReconstructState::new(), + &ctx, + ) .await?; for (key, img_res) in results { @@ -4794,15 +4939,192 @@ mod tests { Ok(()) } + // Test that vectored get descends into ancestor timelines correctly and + // does not return an image that's newer than requested. + // + // The diagram below ilustrates an interesting case. We have a parent timeline + // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed + // from the child timeline, so the parent timeline must be visited. When advacing into + // the child timeline, the read path needs to remember what the requested Lsn was in + // order to avoid returning an image that's too new. The test below constructs such + // a timeline setup and does a few queries around the Lsn of each page image. 
+ // ``` + // LSN + // ^ + // | + // | + // 500 | --------------------------------------> branch point + // 400 | X + // 300 | X + // 200 | --------------------------------------> requested lsn + // 100 | X + // |---------------------------------------> Key + // | + // ------> requested key + // + // Legend: + // * X - page images + // ``` + #[tokio::test] + async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let (tenant, ctx) = harness.load().await; + + let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(1000); + let child_gap_at_key = start_key.add(500); + let mut parent_gap_lsns: BTreeMap = BTreeMap::new(); + + let mut current_lsn = Lsn(0x10); + + let timeline_id = TimelineId::generate(); + let parent_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + for _ in 0..3 { + let mut key = start_key; + while key < end_key { + current_lsn += 0x10; + + let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + + let mut writer = parent_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&image_value)), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + if key == child_gap_at_key { + parent_gap_lsns.insert(current_lsn, image_value); + } + + key = key.next(); + } + + parent_timeline.freeze_and_flush().await?; + } + + let child_timeline_id = TimelineId::generate(); + + let child_timeline = tenant + .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx) + .await?; + + let mut key = start_key; + while key < end_key { + if key == child_gap_at_key { + key = key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", key, 
current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + key = key.next(); + } + + child_timeline.freeze_and_flush().await?; + + let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10]; + let mut query_lsns = Vec::new(); + for image_lsn in parent_gap_lsns.keys().rev() { + for offset in lsn_offsets { + query_lsns.push(Lsn(image_lsn + .0 + .checked_add_signed(offset) + .expect("Shouldn't overflow"))); + } + } + + for query_lsn in query_lsns { + let results = child_timeline + .get_vectored_impl( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + ValuesReconstructState::new(), + &ctx, + ) + .await; + + let expected_item = parent_gap_lsns + .iter() + .rev() + .find(|(lsn, _)| **lsn <= query_lsn); + + info!( + "Doing vectored read at LSN {}. Expecting image to be: {:?}", + query_lsn, expected_item + ); + + match expected_item { + Some((_, img_value)) => { + let key_results = results.expect("No vectored get error expected"); + let key_result = &key_results[&child_gap_at_key]; + let returned_img = key_result + .as_ref() + .expect("No page reconstruct error expected"); + + info!( + "Vectored read at LSN {} returned image {}", + query_lsn, + std::str::from_utf8(returned_img)? 
+ ); + assert_eq!(*returned_img, test_img(img_value)); + } + None => { + assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); + } + } + } + + Ok(()) + } + #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_random_updates")?; + let names_algorithms = [ + ("test_random_updates_legacy", CompactionAlgorithm::Legacy), + ("test_random_updates_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_random_updates_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_random_updates_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); @@ -4861,22 +5183,11 @@ mod tests { ); } - // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; + // Perform a cycle of flush, and GC tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -4897,6 +5208,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; @@ -4960,21 +5273,11 @@ mod tests { } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5156,19 +5459,140 @@ mod tests { #[tokio::test] async fn test_read_at_max_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_read_at_max_lsn")?; + let names_algorithms = [ + ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), + ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_read_at_max_lsn_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_read_at_max_lsn_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = compaction_algorithm; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; let lsn = Lsn(0x10); - bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?; + let compact = false; + bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let read_lsn = Lsn(u64::MAX - 1); - assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok()); + let result = tline.get(test_key, read_lsn, &ctx).await; + assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_scan() -> 
anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 100; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. + let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for _ in 0..10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + &ctx, + ) + .await? 
+ { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform a cycle of flush, compact, and GC + tline.freeze_and_flush().await?; + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } Ok(()) } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 0d33100ead..1dc451f5c9 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -121,7 +121,7 @@ impl BlobWriter { self.offset } - const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; + const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 }; /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise @@ -130,8 +130,9 @@ impl BlobWriter { async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { - let (src_buf, res) = self.inner.write_all(src_buf).await; + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, Err(e) => return (src_buf, Err(e)), @@ -142,9 +143,9 @@ impl BlobWriter { #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. 
- pub async fn flush_buffer(&mut self) -> Result<(), Error> { + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf).await; + let (mut buf, res) = self.inner.write_all(buf, ctx).await; res?; buf.clear(); self.buf = buf; @@ -165,10 +166,11 @@ impl BlobWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf).await; + return self.write_all_unbuffered(src_buf, ctx).await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); @@ -183,7 +185,7 @@ impl BlobWriter { } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - if let Err(e) = self.flush_buffer().await { + if let Err(e) = self.flush_buffer(ctx).await { return (Slice::into_inner(src_buf), Err(e)); } } @@ -199,7 +201,7 @@ impl BlobWriter { assert_eq!(copied, src_buf.len()); Slice::into_inner(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf).await; + let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; if let Err(e) = res { return (src_buf, Err(e)); } @@ -216,6 +218,7 @@ impl BlobWriter { pub async fn write_blob, Buf: IoBuf + Send>( &mut self, srcbuf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let offset = self.offset; @@ -227,7 +230,7 @@ impl BlobWriter { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } else { // Write a 4-byte length header if len > 0x7fff_ffff { @@ -242,7 +245,7 @@ impl BlobWriter { let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } } .await; @@ -251,7 +254,7 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf).await; + let (srcbuf, res) = self.write_all(srcbuf, ctx).await; (srcbuf, res.map(|_| offset)) } } @@ -261,8 +264,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the underlying `VirtualFile`. - pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -299,16 +302,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone()).await; + let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(&ctx).await?; } let file = VirtualFile::open(pathbuf.as_path()).await?; diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index a2bb479f63..9975c9edbc 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -369,6 +369,10 @@ pub struct TenantConf { // How much WAL must be ingested before checking again 
whether a new image layer is required. // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + + /// Switch to aux file v2. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + pub switch_to_aux_file_v2: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -464,6 +468,10 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub switch_to_aux_file_v2: Option, } impl TenantConfOpt { @@ -521,6 +529,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + switch_to_aux_file_v2: self + .switch_to_aux_file_v2 + .unwrap_or(global_conf.switch_to_aux_file_v2), } } } @@ -562,6 +573,7 @@ impl Default for TenantConf { lazy_slru_download: false, timeline_get_throttle: crate::tenant::throttle::Config::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_to_aux_file_v2: false, } } } @@ -636,6 +648,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + switch_to_aux_file_v2: value.switch_to_aux_file_v2, } } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index d1881f3897..33d0f677e5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -436,6 +436,11 @@ impl DeleteTenantFlow { .await } + /// Check whether background deletion of this tenant is currently in progress + pub(crate) fn is_in_progress(tenant: &Tenant) -> bool 
{ + tenant.delete_progress.try_lock().is_err() + } + async fn prepare( tenant: &Arc, ) -> Result, DeleteTenantError> { diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index b27230db03..8b815a1885 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,36 +3,26 @@ use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; +use crate::page_cache; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::{self, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use std::cmp::min; -use std::io::{self, ErrorKind}; -use std::ops::DerefMut; +use std::io; use std::sync::atomic::AtomicU64; -use tracing::*; use utils::id::TimelineId; pub struct EphemeralFile { - page_cache_file_id: page_cache::FileId, - _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - file: VirtualFile, - len: u64, - /// An ephemeral file is append-only. - /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. - /// The other pages, which can no longer be modified, are accessed through the page cache. - /// - /// None <=> IO is ongoing. - /// Size is fixed to PAGE_SZ at creation time and must not be changed. 
- mutable_tail: Option, + + rw: page_caching::RW, } +mod page_caching; +mod zero_padded_read_write; + impl EphemeralFile { pub async fn create( conf: &PageServerConf, @@ -59,21 +49,18 @@ impl EphemeralFile { .await?; Ok(EphemeralFile { - page_cache_file_id: page_cache::next_file_id(), _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - file, - len: 0, - mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)), + rw: page_caching::RW::new(file), }) } pub(crate) fn len(&self) -> u64 { - self.len + self.rw.bytes_written() } - pub(crate) fn id(&self) -> page_cache::FileId { - self.page_cache_file_id + pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { + self.rw.page_cache_file_id() } pub(crate) async fn read_blk( @@ -81,44 +68,7 @@ impl EphemeralFile { blknum: u32, ctx: &RequestContext, ) -> Result { - let flushed_blknums = 0..self.len / PAGE_SZ as u64; - if flushed_blknums.contains(&(blknum as u64)) { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, self.file.path, e, - ), - ) - })? 
{ - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = self - .file - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - }; - } else { - debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail( - self.mutable_tail - .as_deref() - .expect("we're not doing IO, it must be Some()") - .try_into() - .expect("we ensure that it's always PAGE_SZ"), - )) - } + self.rw.read_blk(blknum, ctx).await } pub(crate) async fn write_blob( @@ -126,137 +76,22 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> Result { - struct Writer<'a> { - ephemeral_file: &'a mut EphemeralFile, - /// The block to which the next [`push_bytes`] will write. - blknum: u32, - /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write. 
- off: usize, - } - impl<'a> Writer<'a> { - fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result> { - Ok(Writer { - blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32, - off: (ephemeral_file.len % PAGE_SZ as u64) as usize, - ephemeral_file, - }) - } - #[inline(always)] - async fn push_bytes( - &mut self, - src: &[u8], - ctx: &RequestContext, - ) -> Result<(), io::Error> { - let mut src_remaining = src; - while !src_remaining.is_empty() { - let dst_remaining = &mut self - .ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not yet ongoing")[self.off..]; - let n = min(dst_remaining.len(), src_remaining.len()); - dst_remaining[..n].copy_from_slice(&src_remaining[..n]); - self.off += n; - src_remaining = &src_remaining[n..]; - if self.off == PAGE_SZ { - let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail) - .expect("IO is not yet ongoing"); - let (mutable_tail, res) = self - .ephemeral_file - .file - .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64) - .await; - // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail. - // I.e., the IO isn't retryable if we panic. - self.ephemeral_file.mutable_tail = Some(mutable_tail); - match res { - Ok(_) => { - // Pre-warm the page cache with what we just wrote. - // This isn't necessary for coherency/correctness, but it's how we've always done it. - let cache = page_cache::get(); - match cache - .read_immutable_buf( - self.ephemeral_file.page_cache_file_id, - self.blknum, - ctx, - ) - .await - { - Ok(page_cache::ReadBufResult::Found(_guard)) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum); - } - Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice( - self.ephemeral_file - .mutable_tail - .as_deref() - .expect("IO is not ongoing"), - ); - let _ = write_guard.mark_valid(); - // pre-warm successful - } - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - } - // Zero the buffer for re-use. - // Zeroing is critical for correcntess because the write_blob code below - // and similarly read_blk expect zeroed pages. - self.ephemeral_file - .mutable_tail - .as_deref_mut() - .expect("IO is not ongoing") - .fill(0); - // This block is done, move to next one. - self.blknum += 1; - self.off = 0; - } - Err(e) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}", - self.blknum, - e, - self.ephemeral_file.file.path, - ), - )); - } - } - } - } - Ok(()) - } - } - - let pos = self.len; - let mut writer = Writer::new(self)?; + let pos = self.rw.bytes_written(); // Write the length field if srcbuf.len() < 0x80 { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - writer.push_bytes(&len_buf, ctx).await?; + + self.rw.write_all_borrowed(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - writer.push_bytes(&len_buf, ctx).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } // Write the payload - writer.push_bytes(srcbuf, ctx).await?; - - if srcbuf.len() < 0x80 { - self.len += 1; - } else { - self.len += 4; - } - self.len += srcbuf.len() as u64; 
+ self.rw.write_all_borrowed(srcbuf, ctx).await?; Ok(pos) } @@ -271,28 +106,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl Drop for EphemeralFile { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - let res = std::fs::remove_file(&self.file.path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. - // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.file.path, e - ); - } - } - } -} - impl BlockReader for EphemeralFile { fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs new file mode 100644 index 0000000000..42def8858e --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -0,0 +1,223 @@ +//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the +//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. + +use crate::context::RequestContext; +use crate::page_cache::{self, PAGE_SZ}; +use crate::tenant::block_io::BlockLease; +use crate::virtual_file::VirtualFile; + +use once_cell::sync::Lazy; +use std::io::{self, ErrorKind}; +use tokio_epoll_uring::BoundedBuf; +use tracing::*; + +use super::zero_padded_read_write; + +/// See module-level comment. 
+pub struct RW { + page_cache_file_id: page_cache::FileId, + rw: super::zero_padded_read_write::RW, +} + +impl RW { + pub fn new(file: VirtualFile) -> Self { + let page_cache_file_id = page_cache::next_file_id(); + Self { + page_cache_file_id, + rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( + page_cache_file_id, + file, + )), + } + } + + pub fn page_cache_file_id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + + pub(crate) async fn write_all_borrowed( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { + // It doesn't make sense to proactively fill the page cache on the Pageserver write path + // because Compute is unlikely to access recently written data. + self.rw.write_all_borrowed(srcbuf, ctx).await + } + + pub(crate) fn bytes_written(&self) -> u64 { + self.rw.bytes_written() + } + + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + match self.rw.read_blk(blknum).await? { + zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { + let cache = page_cache::get(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, ctx) + .await + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + // order path before error because error is anyhow::Error => might have many contexts + format!( + "ephemeral file: read immutable page #{}: {}: {:#}", + blknum, + self.rw.as_writer().file.path, + e, + ), + ) + })? 
{ + page_cache::ReadBufResult::Found(guard) => { + return Ok(BlockLease::PageReadGuard(guard)) + } + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = writer + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64) + .await?; + let read_guard = write_guard.mark_valid(); + return Ok(BlockLease::PageReadGuard(read_guard)); + } + } + } + zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { + Ok(BlockLease::EphemeralFileMutableTail(buffer)) + } + } + } +} + +impl Drop for RW { + fn drop(&mut self) { + // There might still be pages in the [`crate::page_cache`] for this file. + // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. + + // unlink the file + let res = std::fs::remove_file(&self.rw.as_writer().file.path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. 
+ // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!( + "could not remove ephemeral file '{}': {}", + self.rw.as_writer().file.path, + e + ); + } + } + } +} + +struct PreWarmingWriter { + nwritten_blocks: u32, + page_cache_file_id: page_cache::FileId, + file: VirtualFile, +} + +impl PreWarmingWriter { + fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + Self { + nwritten_blocks: 0, + page_cache_file_id, + file, + } + } +} + +impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { + async fn write_all< + B: tokio_epoll_uring::BoundedBuf, + Buf: tokio_epoll_uring::IoBuf + Send, + >( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let buf = buf.slice(..); + let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done + let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { + Some(buf.to_vec()) + } else { + None + }; + let buflen = buf.len(); + assert_eq!( + buflen % PAGE_SZ, + 0, + "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" + ); + + // Do the IO. 
+ let iobuf = match self.file.write_all(buf, ctx).await { + (iobuf, Ok(nwritten)) => { + assert_eq!(nwritten, buflen); + iobuf + } + (_, Err(e)) => { + return Err(std::io::Error::new( + ErrorKind::Other, + // order error before path because path is long and error is short + format!( + "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", + self.nwritten_blocks, buflen, e, self.file.path, + ), + )); + } + }; + + // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) + let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); + if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { + assert_eq!(&check_bounds_stuff_works, &*buf); + } + + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + let nblocks = buflen / PAGE_SZ; + let nblocks32 = u32::try_from(nblocks).unwrap(); + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } + } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); + Ok((buflen, buf.into_inner())) + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs new file mode 100644 index 0000000000..b37eafb52c --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -0,0 +1,130 @@ +//! The heart of how [`super::EphemeralFile`] does its reads and writes. +//! +//! # Writes +//! +//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. +//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. +//! +//! # Reads +//! +//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. +//! +//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer +//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] +//! if the read is for the prefix that has already been flushed. +//! +//! # Current Usage +//! +//! The current user of this module is [`super::page_caching::RW`]. + +mod zero_padded; + +use crate::{ + context::RequestContext, + page_cache::PAGE_SZ, + virtual_file::owned_buffers_io::{ + self, + write::{Buffer, OwnedAsyncWriter}, + }, +}; + +const TAIL_SZ: usize = 64 * 1024; + +/// See module-level comment. 
+pub struct RW { + buffered_writer: owned_buffers_io::write::BufferedWriter< + zero_padded::Buffer, + owned_buffers_io::util::size_tracking_writer::Writer, + >, +} + +pub enum ReadResult<'a, W> { + NeedsReadFromWriter { writer: &'a W }, + ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, +} + +impl RW +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + let bytes_flushed_tracker = + owned_buffers_io::util::size_tracking_writer::Writer::new(writer); + let buffered_writer = owned_buffers_io::write::BufferedWriter::new( + bytes_flushed_tracker, + zero_padded::Buffer::default(), + ); + Self { buffered_writer } + } + + pub(crate) fn as_writer(&self) -> &W { + self.buffered_writer.as_inner().as_inner() + } + + pub async fn write_all_borrowed( + &mut self, + buf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf, ctx).await + } + + pub fn bytes_written(&self) -> u64 { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + flushed_offset + u64::try_from(buffer.pending()).unwrap() + } + + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); + let read_offset = (blknum as u64) * (PAGE_SZ as u64); + + // The trailing page ("block") might only be partially filled, + // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. + // Moreover, it has to be zero-padded, because when we still had + // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. + // DeltaLayer probably has the same issue, not sure why it needs no special treatment. 
+ // => check here that the read doesn't go beyond this potentially trailing + // => the zero-padding is done in the `else` branch below + let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { + buffered_offset / (PAGE_SZ as u64) + } else { + (buffered_offset / (PAGE_SZ as u64)) + 1 + }; + if (blknum as u64) >= blocks_written { + return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); + } + + // assertions for the `if-else` below + assert_eq!( + flushed_offset % (TAIL_SZ as u64), 0, + "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" + ); + assert_eq!( + flushed_offset % (PAGE_SZ as u64), + 0, + "the logic below can't handle if the page is spread across the flushed part and the buffer" + ); + + if read_offset < flushed_offset { + assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); + Ok(ReadResult::NeedsReadFromWriter { + writer: self.as_writer(), + }) + } else { + let read_offset_in_buffer = read_offset + .checked_sub(flushed_offset) + .expect("would have taken `if` branch instead of this one"); + let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); + let zero_padded_slice = buffer.as_zero_padded_slice(); + let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; + Ok(ReadResult::ServedFromZeroPaddedMutableTail { + buffer: page + .try_into() + .expect("the slice above got it as page-size slice"), + }) + } + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs new file mode 100644 index 0000000000..f90291bbf8 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -0,0 +1,108 @@ +//! 
A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose +//! unwritten range is guaranteed to be zero-initialized. +//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] +//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. + +use std::mem::MaybeUninit; + +/// See module-level comment. +pub struct Buffer { + allocation: Box<[u8; N]>, + written: usize, +} + +impl Default for Buffer { + fn default() -> Self { + Self { + allocation: Box::new( + // SAFETY: zeroed memory is a valid [u8; N] + unsafe { MaybeUninit::zeroed().assume_init() }, + ), + written: 0, + } + } +} + +impl Buffer { + #[inline(always)] + fn invariants(&self) { + // don't check by default, unoptimized is too expensive even for debug mode + if false { + debug_assert!(self.written <= N, "{}", self.written); + debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); + } + } + + pub fn as_zero_padded_slice(&self) -> &[u8; N] { + &self.allocation + } +} + +impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { + type IoBuf = Self; + + fn cap(&self) -> usize { + self.allocation.len() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + self.invariants(); + let remaining = self.allocation.len() - self.written; + if other.len() > remaining { + panic!("calling extend_from_slice() with insufficient remaining capacity"); + } + self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); + self.written += other.len(); + self.invariants(); + } + + fn pending(&self) -> usize { + self.written + } + + fn flush(self) -> tokio_epoll_uring::Slice { + self.invariants(); + let written = self.written; + tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + } + + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let Self { + mut allocation, + written, + } = iobuf; + allocation[0..written].fill(0); + let new = Self { + allocation, + written: 0, + }; + 
new.invariants(); + new + } +} + +/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a +/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. +/// +/// Remember that bytes_init is generally _not_ a tracker of the amount +/// of valid data in the io buffer; we use `Slice` for that. +/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. +/// +/// SAFETY: +/// +/// The [`Self::allocation`] is stable becauses boxes are stable. +/// The memory is zero-initialized, so, bytes_init is always N. +unsafe impl tokio_epoll_uring::IoBuf for Buffer { + fn stable_ptr(&self) -> *const u8 { + self.allocation.as_ptr() + } + + fn bytes_init(&self) -> usize { + // Yes, N, not self.written; Read the full comment of this impl block! + N + } + + fn bytes_total(&self) -> usize { + N + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 4c4cd90c99..2724a5cc07 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -588,7 +588,7 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Add the final interval @@ -672,12 +672,12 @@ impl LayerMap { // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = 
limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -689,17 +689,17 @@ impl LayerMap { } current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Consider the last part - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( @@ -916,6 +916,7 @@ mod tests { assert_eq!(lhs, rhs); } + #[cfg(test)] fn brute_force_range_search( layer_map: &LayerMap, key_range: Range, diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 1736950d1f..39da713479 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -235,6 +235,12 @@ impl TimelineMetadata { let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() } + + pub(crate) fn apply(&mut self, update: &MetadataUpdate) { + self.body.disk_consistent_lsn = update.disk_consistent_lsn; + self.body.prev_record_lsn = update.prev_record_lsn; + self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn; + } } impl<'de> Deserialize<'de> for TimelineMetadata { @@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata { } } +/// Parts of the metadata which are regularly modified. 
+pub(crate) struct MetadataUpdate { + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, +} + +impl MetadataUpdate { + pub(crate) fn new( + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, + ) -> Self { + Self { + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cutoff_lsn, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b1b46d487b..006d501daa 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,6 +2,7 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; use pageserver_api::models::LocationConfigMode; @@ -253,17 +254,15 @@ impl TenantsMap { } } +/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then +/// the slower actual deletion in the background. +/// /// This is "safe" in that that it won't leave behind a partially deleted directory /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting /// the contents. /// /// This is pageserver-specific, as it relies on future processes after a crash to check /// for TEMP_FILE_SUFFIX when loading things. 
-async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { - let tmp_path = safe_rename_tenant_dir(path).await?; - fs::remove_dir_all(tmp_path).await -} - async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { let parent = path .as_ref() @@ -286,6 +285,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); @@ -570,7 +591,11 @@ pub async fn init_tenant_mgr( ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); - // Construct `Tenant` objects and start them running + // Accumulate futures for writing tenant configs, so that we can execute in parallel + let mut config_write_futs = Vec::new(); + + // Update the location configs according to the re-attach response and persist them to disk + tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -597,18 +622,22 @@ pub async fn init_tenant_mgr( const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = SecondaryLocationConfig { warm: true }; - // Update the location config according to the re-attach response if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. 
match tenant_modes.get(&tenant_shard_id) { None => { info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } + + match safe_rename_tenant_dir(&tenant_dir_path).await { + Ok(tmp_path) => { + spawn_background_purge(tmp_path); + } + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); + } + }; // We deleted local content: move on to next tenant, don't try and spawn this one. continue; @@ -654,8 +683,32 @@ pub async fn init_tenant_mgr( // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. 
- Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + config_write_futs.push(async move { + let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + (tenant_shard_id, location_conf, r) + }); + } + // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency + tracing::info!( + "Writing {} location config files...", + config_write_futs.len() + ); + let config_write_results = futures::stream::iter(config_write_futs) + .buffer_unordered(16) + .collect::>() + .await; + + tracing::info!( + "Spawning {} tenant shard locations...", + config_write_results.len() + ); + // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running + for (tenant_shard_id, location_conf, config_write_result) in config_write_results { + // Errors writing configs are fatal + config_write_result?; + + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { LocationMode::Attached(attached_conf) => { @@ -678,12 +731,19 @@ pub async fn init_tenant_mgr( } } } - LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - shard_identity, - location_conf.tenant_conf, - &secondary_conf, - )), + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) + } }; tenants.insert(tenant_shard_id, slot); @@ -1410,9 +1470,15 @@ impl TenantManager { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. 
} => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. + // If deletion is already in progress, return success (the semantics of this + // function are to rerturn success afterr deletion is spawned in background). + // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. + if DeleteTenantFlow::is_in_progress(&tenant) { + // The `delete_progress` lock is held: deletion is already happening + // in the bacckground + slot_guard.revert(); + return Ok(()); + } } _ => { tenant @@ -1686,7 +1752,7 @@ impl TenantManager { let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( "failpoint" @@ -1841,28 +1907,6 @@ impl TenantManager { shutdown_all_tenants0(self.tenants).await } - /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in - /// the background, and thereby avoid blocking any API requests on this deletion completing. - fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) { - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. 
- let task_tenant_id = None; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - } - pub(crate) async fn detach_tenant( &self, conf: &'static PageServerConf, @@ -1879,7 +1923,7 @@ impl TenantManager { deletion_queue_client, ) .await?; - self.spawn_background_purge(tmp_path); + spawn_background_purge(tmp_path); Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 3879135f26..a54e93c96b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -202,12 +202,15 @@ use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{ + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, +}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::context::RequestContext; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, @@ -236,11 +239,14 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; +use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; -pub(crate) use download::{is_temp_download_file, list_remote_timelines}; +pub(crate) use download::{ + download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, +}; pub(crate) use index::LayerFileMetadata; // 
Occasional network issues and such can cause remote operations to fail, and @@ -469,7 +475,7 @@ impl RemoteTimelineClient { }, ); - let index_part = download::download_index_part( + let (index_part, _index_generation) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -500,6 +506,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -517,6 +524,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, cancel, + ctx, ) .measure_remote_op( RemoteOpFileKind::Layer, @@ -536,9 +544,10 @@ impl RemoteTimelineClient { // Upload operations. // - /// /// Launch an index-file upload operation in the background, with - /// updated metadata. + /// fully updated metadata. + /// + /// This should only be used to upload initial metadata to remote storage. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previously scheduled layer file @@ -550,7 +559,7 @@ impl RemoteTimelineClient { /// If there were any changes to the list of files, i.e. if any /// layer file uploads were scheduled, since the last index file /// upload, those will be included too. - pub fn schedule_index_upload_for_metadata_update( + pub fn schedule_index_upload_for_full_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -566,6 +575,27 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, with only parts of the metadata + /// updated. + /// + /// This is the regular way of updating metadata on layer flushes or Gc. + /// + /// Using this lighter update mechanism allows for reparenting and detaching without changes to + /// `index_part.json`, while being more clear on what values update regularly. 
+ pub(crate) fn schedule_index_upload_for_metadata_update( + self: &Arc, + update: &MetadataUpdate, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.latest_metadata.apply(update); + + self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -1122,7 +1152,7 @@ impl RemoteTimelineClient { // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); - // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. self.flush_deletion_queue().await?; @@ -1131,14 +1161,20 @@ impl RemoteTimelineClient { let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path), None, &cancel) + .list( + Some(&timeline_storage_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await }, "list remaining files", &cancel, ) .await - .context("list files remaining files")?; + .context("list files remaining files")? 
+ .keys; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute @@ -1685,6 +1721,11 @@ impl RemoteTimelineClient { } } +pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") @@ -2024,7 +2065,7 @@ mod tests { // Schedule upload of index. Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6ee8ad7155..b038f264f5 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,6 +5,7 @@ use std::collections::HashSet; use std::future::Future; +use std::str::FromStr; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -17,6 +18,7 @@ use tracing::warn; use utils::backoff; use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; @@ -25,13 +27,13 @@ use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::TimelineId; 
+use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, - INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -39,6 +41,7 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. +#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, @@ -47,6 +50,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -74,7 +78,7 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, + || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) @@ -132,6 +136,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, + #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), @@ -182,6 +187,7 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use bytes::BytesMut; async { let 
destination_file = VirtualFile::create(dst_path) .await @@ -194,10 +200,10 @@ async fn download_object<'a>( // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::< - { super::BUFFER_SIZE }, - _, - >::new(size_tracking); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + size_tracking, + BytesMut::with_capacity(super::BUFFER_SIZE), + ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -206,10 +212,10 @@ async fn download_object<'a>( Err(e) => return Err(e), }; buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) .await?; } - let size_tracking = buffered.flush_and_into_inner().await?; + let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) } .await?; @@ -252,42 +258,31 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { } } -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines( +async fn list_identifiers( storage: &GenericRemoteStorage, - tenant_shard_id: TenantShardId, + prefix: RemotePath, cancel: CancellationToken, -) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id); - - fail::fail_point!("storage-sync-list-remote-timelines", |_| { - anyhow::bail!("storage-sync-list-remote-timelines"); - }); - +) -> anyhow::Result<(HashSet, HashSet)> +where + T: FromStr + Eq + std::hash::Hash, +{ let listing = download_retry_forever( - || { - storage.list( - Some(&remote_path), - ListingMode::WithDelimiter, - None, - &cancel, - ) - }, - &format!("list timelines for {tenant_shard_id}"), + || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), + &format!("list identifiers in prefix {prefix}"), &cancel, 
) .await?; - let mut timeline_ids = HashSet::new(); + let mut parsed_ids = HashSet::new(); let mut other_prefixes = HashSet::new(); - for timeline_remote_storage_key in listing.prefixes { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") + for id_remote_storage_key in listing.prefixes { + let object_name = id_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") })?; - match object_name.parse::() { - Ok(t) => timeline_ids.insert(t), + match object_name.parse::() { + Ok(t) => parsed_ids.insert(t), Err(_) => other_prefixes.insert(object_name.to_string()), }; } @@ -299,7 +294,31 @@ pub async fn list_remote_timelines( other_prefixes.insert(object_name.to_string()); } - Ok((timeline_ids, other_prefixes)) + Ok((parsed_ids, other_prefixes)) +} + +/// List shards of given tenant in remote storage +pub(crate) async fn list_remote_tenant_shards( + storage: &GenericRemoteStorage, + tenant_id: TenantId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); + list_identifiers::(storage, remote_path, cancel).await +} + +/// List timelines of given tenant shard in remote storage +pub async fn list_remote_timelines( + storage: &GenericRemoteStorage, + tenant_shard_id: TenantShardId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); + list_identifiers::(storage, remote_path, cancel).await } async fn do_download_index_part( @@ -308,7 +327,7 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> 
Result<(IndexPart, Generation), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let index_part_bytes = download_retry_forever( @@ -333,7 +352,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok(index_part) + Ok((index_part, index_generation)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -342,13 +361,13 @@ async fn do_download_index_part( /// In this function we probe for the most recent index in a generation <= our current generation. /// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(super) async fn download_index_part( +pub(crate) async fn download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result { +) -> Result<(IndexPart, Generation), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { @@ -417,11 +436,16 @@ pub(super) async fn download_index_part( let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = download_retry( - || async { storage.list_files(Some(&index_prefix), None, cancel).await }, + || async { + storage + .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .await + }, "list index_part files", cancel, ) - .await?; + .await? + .keys; // General case logic for which index to use: the latest index whose generation // is <= our own. 
See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 137fe48b73..0227331953 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant( let warn_after = 3; let max_attempts = 10; let mut prefixes = Vec::with_capacity(2); - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Also recover the unsharded prefix for a shard of zero: // - if the tenant is totally unsharded, the unsharded prefix contains all the data // - if the tenant is sharded, we still want to recover the initdb data, but we only diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 19f36c722e..5c46df268a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -7,6 +7,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, virtual_file::MaybeFatalIo, @@ -316,9 +317,13 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + let downloader_task_ctx = RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, + downloader_task_ctx.task_kind(), None, None, "secondary tenant downloads", @@ -330,6 +335,7 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + downloader_task_ctx, ) .await; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 5b29c126d1..fb8907b5a8 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ 
b/pageserver/src/tenant/secondary/downloader.rs @@ -8,6 +8,7 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, @@ -30,7 +31,10 @@ use crate::{ use super::{ heatmap::HeatMapLayer, - scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, + TenantBackgroundJobs, + }, SecondaryTenant, }; @@ -44,7 +48,6 @@ use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; @@ -74,12 +77,14 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); @@ -92,6 +97,7 @@ pub(super) async fn downloader_task( struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] @@ -270,7 +276,7 @@ impl JobGenerator SchedulingResult { @@ -301,18 +307,16 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, @@ -367,11 +371,12 @@ impl JobGenerator { @@ -485,7 +490,7 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the 
SecondaryTenant::gate, to ensure @@ -560,7 +565,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline) + self.download_timeline(timeline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -647,6 +652,12 @@ impl<'a> TenantDownloader<'a> { progress.bytes_downloaded += layer_byte_count; progress.layers_downloaded += layer_count; } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. + detail.timelines.remove(delete_timeline); + } } // Execute accumulated deletions @@ -710,13 +721,14 @@ impl<'a> TenantDownloader<'a> { .await .map_err(UpdateError::from)?; + SECONDARY_MODE.download_heatmap.inc(); + if Some(&download.etag) == prev_etag { Ok(HeatMapDownload::Unmodified) } else { let mut heatmap_bytes = Vec::new(); let mut body = tokio_util::io::StreamReader::new(download.download_stream); let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - SECONDARY_MODE.download_heatmap.inc(); Ok(HeatMapDownload::Modified(HeatMapModified { etag: download.etag, last_modified: download.last_modified, @@ -735,7 +747,11 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_path = self @@ -868,6 +884,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &LayerFileMetadata::from(&layer.metadata), &self.secondary_state.cancel, + ctx, ) .await { diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 
39d088ffc3..352409f5fc 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -20,12 +20,14 @@ use crate::{ use futures::Future; use pageserver_api::shard::TenantShardId; -use rand::Rng; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; use super::{ heatmap::HeatMapTenant, - scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, + TenantBackgroundJobs, + }, CommandRequest, UploadCommand, }; use tokio_util::sync::CancellationToken; @@ -181,15 +183,11 @@ impl JobGenerator let state = self .tenants .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| { - let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); - - UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), - last_digest: None, - } + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), + last_digest: None, }); // Decline to do the upload if insufficient time has passed @@ -274,7 +272,7 @@ impl JobGenerator let next_upload = tenant .get_heatmap_period() - .and_then(|period| now.checked_add(period)); + .and_then(|period| now.checked_add(period_jitter(period, 5))); WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 3bd7be782e..3d042f4513 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,4 +1,5 @@ use futures::Future; +use rand::Rng; use std::{ collections::HashMap, marker::PhantomData, @@ -19,6 +20,26 @@ use super::{CommandRequest, CommandResponse}; const MAX_SCHEDULING_INTERVAL: 
Duration = Duration::from_secs(10); const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); +/// Jitter a Duration by an integer percentage. Returned values are uniform +/// in the range 100-pct..100+pct (i.e. a 5% jitter is 5% either way: a ~10% range) +pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { + if d == Duration::ZERO { + d + } else { + rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) + } +} + +/// When a periodic task first starts, it should wait for some time in the range 0..period, so +/// that starting many such tasks at the same time spreads them across the time range. +pub(super) fn period_warmup(period: Duration) -> Duration { + if period == Duration::ZERO { + period + } else { + rand::thread_rng().gen_range(Duration::ZERO..period) + } +} + /// Scheduling helper for background work across many tenants. /// /// Systems that need to run background work across many tenants may use this type diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index ad79b74d8b..64fff5536c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -189,7 +189,9 @@ pub(super) async fn gather_inputs( // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // horizon_cutoff. - let mut next_gc_cutoff = gc_info.pitr_cutoff; + let pitr_cutoff = gc_info.cutoffs.pitr; + let horizon_cutoff = gc_info.cutoffs.horizon; + let mut next_gc_cutoff = pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { @@ -216,6 +218,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + drop(gc_info); + // Add branch points we collected earlier, just in case there were any that were // not present in retain_lsns. We will remove any duplicates below later. if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { @@ -294,8 +298,8 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, + horizon_cutoff, + pitr_cutoff, next_gc_cutoff, retention_param_cutoff, }); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9a2b086828..4f1b56ef9f 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState { pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + layers_visited: u32, } impl ValuesReconstructState { @@ -125,6 +126,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + layers_visited: 0, } } @@ -138,6 +140,37 @@ impl ValuesReconstructState { } } + pub(crate) fn on_layer_visited(&mut self) { + self.layers_visited += 1; + } + + pub(crate) fn get_layers_visited(&self) -> u32 { + self.layers_visited + } + + /// This function is called after reading a keyspace from a layer. + /// It checks if the read path has now moved past the cached Lsn for any keys. + /// + /// Implementation note: We intentionally iterate over the keys for which we've + /// already collected some reconstruct data. This avoids scaling complexity with + /// the size of the search space. 
+ pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { + for (key, value) in self.keys.iter_mut() { + if !keyspace.contains(key) { + continue; + } + + if let Ok(state) = value { + if state.situation != ValueReconstructSituation::Complete + && state.get_cached_lsn() >= Some(advanced_to) + { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + } + } + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// @@ -162,11 +195,18 @@ impl ValuesReconstructState { true } Value::WalRecord(rec) => { - let reached_cache = - state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn); + debug_assert!( + Some(lsn) > state.get_cached_lsn(), + "Attempt to collect a record below cached LSN for walredo: {} < {}", + lsn, + state + .get_cached_lsn() + .expect("Assertion can only fire if a cached lsn is present") + ); + let will_init = rec.will_init(); state.records.push((lsn, rec)); - will_init || reached_cache + will_init } }, }; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 466d95f46d..b5538dff3a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -20,8 +20,8 @@ //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! -//! Every delta file consists of three parts: "summary", "index", and -//! "values". The summary is a fixed size header at the beginning of the file, +//! Every delta file consists of three parts: "summary", "values", and +//! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. 
The actual page images and WAL records are stored in the @@ -217,6 +217,7 @@ pub struct DeltaLayerInner { // values copied from summary index_start_blk: u32, index_root_blk: u32, + lsn_range: Range, file: VirtualFile, file_id: FileId, @@ -427,9 +428,15 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) .await; res } @@ -440,9 +447,10 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val).await; + let (val, res) = self.blob_writer.write_blob(val, ctx).await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -462,18 +470,23 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. 
/// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -493,7 +506,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -591,8 +604,18 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. 
/// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value(key, lsn, val, ctx) + .await } pub async fn put_value_bytes( @@ -601,11 +624,12 @@ impl DeltaLayerWriter { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -620,10 +644,11 @@ impl DeltaLayerWriter { mut self, key_end: Key, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let inner = self.inner.take().unwrap(); let temp_path = inner.path.clone(); - let result = inner.finish(key_end, timeline).await; + let result = inner.finish(key_end, timeline, ctx).await; // The delta layer files can sometimes be really large. Clean them up. if result.is_err() { tracing::warn!( @@ -691,7 +716,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -728,6 +753,9 @@ impl DeltaLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; + if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", @@ -742,6 +770,7 @@ impl DeltaLayerInner { file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + lsn_range: actual_summary.lsn_range, max_vectored_read_bytes, })) } @@ -863,10 +892,10 @@ impl DeltaLayerInner { .into(), ); - let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let data_end_offset = self.index_start_offset(); let reads = Self::plan_reads( - keyspace, + &keyspace, lsn_range, data_end_offset, index_reader, @@ -880,11 +909,13 @@ impl DeltaLayerInner { self.do_reads_and_update_state(reads, reconstruct_state) .await; + reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start); + Ok(()) } async fn plan_reads( - keyspace: KeySpace, + keyspace: &KeySpace, lsn_range: Range, data_end_offset: u64, index_reader: DiskBtreeReader, @@ -939,7 +970,7 @@ impl DeltaLayerInner { } if !range_end_handled { - tracing::info!("Handling range end fallback at {}", data_end_offset); + tracing::debug!("Handling range end fallback at {}", data_end_offset); planner.handle_range_end(data_end_offset); } } @@ -1103,11 +1134,201 @@ impl DeltaLayerInner { if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index - last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size; + last.size = self.index_start_offset() - last.size; } Ok(all_keys) } + /// Using the given writer, write out a truncated version, where LSNs higher than the + /// truncate_at are missing. 
+ #[cfg(test)] + pub(super) async fn copy_prefix( + &self, + writer: &mut DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use crate::tenant::vectored_blob_io::{ + BlobMeta, VectoredReadBuilder, VectoredReadExtended, + }; + use futures::stream::TryStreamExt; + + #[derive(Debug)] + enum Item { + Actual(Key, Lsn, BlobRef), + Sentinel, + } + + impl From for Option<(Key, Lsn, BlobRef)> { + fn from(value: Item) -> Self { + match value { + Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), + Item::Sentinel => None, + } + } + } + + impl Item { + fn offset(&self) -> Option { + match self { + Item::Actual(_, _, blob) => Some(*blob), + Item::Sentinel => None, + } + } + + fn is_last(&self) -> bool { + matches!(self, Item::Sentinel) + } + } + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); + // put in a sentinel value for getting the end offset for last item, and not having to + // repeat the whole read part + let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( + Item::Sentinel, + )))); + let mut stream = std::pin::pin!(stream); + + let mut prev: Option<(Key, Lsn, BlobRef)> = None; + + let mut read_builder: Option = None; + + let max_read_size = self + .max_vectored_read_bytes + .map(|x| x.0.get()) + .unwrap_or(8192); + + let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + + // FIXME: buffering of DeltaLayerWriter + let mut per_blob_copy = Vec::new(); + + while let Some(item) = stream.try_next().await? 
{ + tracing::debug!(?item, "popped"); + let offset = item + .offset() + .unwrap_or(BlobRef::new(self.index_start_offset(), false)); + + let actionable = if let Some((key, lsn, start_offset)) = prev.take() { + let end_offset = offset; + + Some((BlobMeta { key, lsn }, start_offset..end_offset)) + } else { + None + }; + + let is_last = item.is_last(); + + prev = Option::from(item); + + let actionable = actionable.filter(|x| x.0.lsn < truncate_at); + + let builder = if let Some((meta, offsets)) = actionable { + // extend or create a new builder + if read_builder + .as_mut() + .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) + .unwrap_or(VectoredReadExtended::No) + == VectoredReadExtended::Yes + { + None + } else { + read_builder.replace(VectoredReadBuilder::new( + offsets.start.pos(), + offsets.end.pos(), + meta, + max_read_size, + )) + } + } else { + // nothing to do, except perhaps flush any existing for the last element + None + }; + + // flush the possible older builder and also the new one if the item was the last one + let builders = builder.into_iter(); + let builders = if is_last { + builders.chain(read_builder.take()) + } else { + builders.chain(None) + }; + + for builder in builders { + let read = builder.build(); + + let reader = VectoredBlobReader::new(&self.file); + + let mut buf = buffer.take().unwrap(); + + buf.clear(); + buf.reserve(read.size()); + let res = reader.read_blobs(&read, buf).await?; + + for blob in res.blobs { + let key = blob.meta.key; + let lsn = blob.meta.lsn; + let data = &res.buf[blob.start..blob.end]; + + #[cfg(debug_assertions)] + Value::des(data) + .with_context(|| { + format!( + "blob failed to deserialize for {}@{}, {}..{}: {:?}", + blob.meta.key, + blob.meta.lsn, + blob.start, + blob.end, + utils::Hex(data) + ) + }) + .unwrap(); + + // is it an image or will_init walrecord? 
+ // FIXME: this could be handled by threading the BlobRef to the + // VectoredReadBuilder + let will_init = crate::repository::ValueBytes::will_init(data) + .inspect_err(|_e| { + #[cfg(feature = "testing")] + tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value"); + }) + .unwrap_or(false); + + per_blob_copy.clear(); + per_blob_copy.extend_from_slice(data); + + let (tmp, res) = writer + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy), + will_init, + ctx, + ) + .await; + per_blob_copy = tmp; + res?; + } + + buffer = Some(res.buf); + } + } + + assert!( + read_builder.is_none(), + "with the sentinel above loop should had handled all" + ); + + Ok(()) + } + pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", @@ -1177,6 +1398,44 @@ impl DeltaLayerInner { Ok(()) } + + #[cfg(test)] + fn stream_index_forwards<'a, R>( + &'a self, + reader: &'a DiskBtreeReader, + start: &'a [u8; DELTA_KEY_SIZE], + ctx: &'a RequestContext, + ) -> impl futures::stream::Stream< + Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, + > + 'a + where + R: BlockReader, + { + use futures::stream::TryStreamExt; + let stream = reader.get_stream_from(start, ctx); + stream.map_ok(|(key, value)| { + let key = DeltaKey::from_slice(&key); + let (key, lsn) = (key.key(), key.lsn()); + let offset = BlobRef(value); + + (key, lsn, offset) + }) + } + + /// The file offset to the first block of index. + /// + /// The file structure is summary, values, and index. We often need this for the size of last blob. 
+ fn index_start_offset(&self) -> u64 { + let offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let bref = BlobRef(offset); + tracing::debug!( + index_start_blk = self.index_start_blk, + offset, + pos = bref.pos(), + "index_start_offset" + ); + offset + } } /// A set of data associated with a delta layer key and its value @@ -1310,7 +1569,7 @@ mod test { // Plan and validate let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, lsn_range.clone(), disk_offset, reader, @@ -1531,14 +1790,16 @@ mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false) + .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) .await; res?; } - let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; + let resident = writer + .finish(entries_meta.key_range.end, &timeline, &ctx) + .await?; - let inner = resident.get_inner_delta(&ctx).await?; + let inner = resident.as_delta(&ctx).await?; let file_size = inner.file.metadata().await?.len(); tracing::info!( @@ -1562,7 +1823,7 @@ mod test { let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; let vectored_reads = DeltaLayerInner::plan_reads( - keyspace.clone(), + &keyspace, entries_meta.lsn_range.clone(), data_end_offset, index_reader, @@ -1594,4 +1855,217 @@ mod test { Ok(()) } + + #[tokio::test] + async fn copy_delta_prefix_smoke() { + use crate::walrecord::NeonWalRecord; + use bytes::Bytes; + + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let (tenant, ctx) = h.load().await; + let ctx = &ctx; + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .await + .unwrap(); + + let initdb_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .next() + .unwrap(); + + { + let mut writer = timeline.writer().await; + + let data = [ + (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), + ( + 0x30, + 12, + 
Value::WalRecord(NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b"1"), + }), + ), + ( + 0x40, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"2"), + }), + ), + // build an oversized value so we cannot extend and existing read over + // this + ( + 0x50, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: { + let mut buf = + vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; + buf.iter_mut() + .enumerate() + .for_each(|(i, slot)| *slot = (i % 256) as u8); + Bytes::from(buf) + }, + }), + ), + // because the oversized read cannot be extended further, we are sure to exercise the + // builder created on the last round with this: + ( + 0x60, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"3"), + }), + ), + ( + 0x60, + 9, + Value::Image(Bytes::from_static(b"something for a different key")), + ), + ]; + + let mut last_lsn = None; + + for (lsn, key, value) in data { + let key = Key::from_i128(key); + writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); + last_lsn = Some(lsn); + } + + writer.finish_write(Lsn(last_lsn.unwrap())); + } + timeline.freeze_and_flush().await.unwrap(); + + let new_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .find(|x| x != &initdb_layer) + .unwrap(); + + // create a copy for the timeline, so we don't overwrite the file + let branch = tenant + .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) + .await + .unwrap(); + + assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); + + // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just + // a single key + + for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { + let truncate_at = Lsn(truncate_at); + + let mut writer = DeltaLayerWriter::new( + tenant.conf, + branch.timeline_id, + tenant.tenant_shard_id, + Key::MIN, + Lsn(0x11)..truncate_at, + ) + .await + .unwrap(); + + 
let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + + new_layer + .copy_delta_prefix(&mut writer, truncate_at, ctx) + .await + .unwrap(); + + let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + + copied_layer.as_delta(ctx).await.unwrap(); + + assert_keys_and_values_eq( + new_layer.as_delta(ctx).await.unwrap(), + copied_layer.as_delta(ctx).await.unwrap(), + truncate_at, + ctx, + ) + .await; + } + } + + async fn assert_keys_and_values_eq( + source: &DeltaLayerInner, + truncated: &DeltaLayerInner, + truncated_at: Lsn, + ctx: &RequestContext, + ) { + use futures::future::ready; + use futures::stream::TryStreamExt; + + let start_key = [0u8; DELTA_KEY_SIZE]; + + let source_reader = FileBlockReader::new(&source.file, source.file_id); + let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + source.index_start_blk, + source.index_root_blk, + &source_reader, + ); + let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source_stream.filter(|res| match res { + Ok((_, lsn, _)) => ready(lsn < &truncated_at), + _ => ready(true), + }); + let mut source_stream = std::pin::pin!(source_stream); + + let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); + let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + truncated.index_start_blk, + truncated.index_root_blk, + &truncated_reader, + ); + let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let mut truncated_stream = std::pin::pin!(truncated_stream); + + let mut scratch_left = Vec::new(); + let mut scratch_right = Vec::new(); + + loop { + let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); + let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); + + if src.is_none() { + assert!(truncated.is_none()); + break; + } + + let (src, truncated) = (src.unwrap(), truncated.unwrap()); + + // because we've filtered the source with 
Lsn, we should always have the same keys from both. + assert_eq!(src.0, truncated.0); + assert_eq!(src.1, truncated.1); + + // if this is needed for something else, just drop this assert. + assert!( + src.2.pos() >= truncated.2.pos(), + "value position should not go backwards {} vs. {}", + src.2.pos(), + truncated.2.pos() + ); + + scratch_left.clear(); + let src_cursor = source_reader.block_cursor(); + let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); + scratch_right.clear(); + let trunc_cursor = truncated_reader.block_cursor(); + let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); + + tokio::try_join!(left, right).unwrap(); + + assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); + } + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 5b44d2bc2c..1477a1fc33 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -357,7 +357,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -396,6 +396,8 @@ impl ImageLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; if actual_summary != expected_summary { bail!( @@ -675,9 +677,14 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. 
/// - async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img).await; + let (_img, res) = self.blob_writer.write_blob(img, ctx).await; // TODO: re-use the buffer for `img` further upstack let off = res?; @@ -691,7 +698,11 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. /// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -702,7 +713,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } @@ -722,7 +733,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -804,8 +815,13 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. 
/// - pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await } /// @@ -814,8 +830,9 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 43942ba2db..4dacbec2f3 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -17,15 +17,16 @@ use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BTreeMap, BinaryHeap, HashSet}; use std::sync::{Arc, OnceLock}; +use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use crate::metrics::TIMELINE_EPHEMERAL_BYTES; use std::cmp::Ordering; -use std::fmt::Write as _; +use std::fmt::Write; use std::ops::Range; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::atomic::{AtomicU64, AtomicUsize}; @@ -53,6 +54,14 @@ pub struct InMemoryLayer { /// Writes are only allowed when this is `None`. end_lsn: OnceLock, + /// Used for traversal path. Cached representation of the in-memory layer before frozen. + local_path_str: Arc, + + /// Used for traversal path. Cached representation of the in-memory layer after frozen. 
+ frozen_local_path_str: OnceLock>, + + opened_at: Instant, + /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -69,10 +78,10 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed + /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: HashMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -238,6 +247,12 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } + pub(crate) fn local_path_str(&self) -> &Arc { + self.frozen_local_path_str + .get() + .unwrap_or(&self.local_path_str) + } + /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -369,29 +384,24 @@ impl InMemoryLayer { let mut planned_block_reads = BinaryHeap::new(); for range in keyspace.ranges.iter() { - let mut key = range.start; - while key < range.end { - if let Some(vec_map) = inner.index.get(&key) { - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; + for (key, vec_map) in inner.index.range(range.start..range.end) { + let lsn_range = match reconstruct_state.get_cached_lsn(key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - planned_block_reads.push(BlockRead { - key, - lsn: *entry_lsn, - block_offset: *pos, - }); - } + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key: *key, + 
lsn: *entry_lsn, + block_offset: *pos, + }); } - - key = key.next(); } } - let keyspace_size = keyspace.total_size(); + let keyspace_size = keyspace.total_raw_size(); let mut completed_keys = HashSet::new(); while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { @@ -423,14 +433,30 @@ impl InMemoryLayer { } } + reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + Ok(()) } } +fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { + write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) +} + +fn inmem_layer_log_display( + mut f: impl Write, + timeline: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, +) -> std::fmt::Result { + write!(f, "timeline {} in-memory ", timeline)?; + inmem_layer_display(f, start_lsn, end_lsn) +} + impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let end_lsn = self.end_lsn_or_max(); - write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + inmem_layer_display(f, self.start_lsn, end_lsn) } } @@ -451,17 +477,24 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; - let key = InMemoryLayerFileId(file.id()); + let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { file_id: key, + local_path_str: { + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); + buf.into() + }, + frozen_local_path_str: OnceLock::new(), conf, timeline_id, tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { - index: HashMap::new(), + index: BTreeMap::new(), file, resource_units: GlobalResourceUnits::new(), }), @@ -520,6 +553,10 @@ impl InMemoryLayer { Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + 
pub(crate) async fn tick(&self) -> Option { let mut inner = self.inner.write().await; let size = inner.file.len(); @@ -544,6 +581,15 @@ impl InMemoryLayer { ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); + self.frozen_local_path_str + .set({ + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn) + .unwrap(); + buf.into() + }) + .expect("frozen_local_path_str set only once"); + for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { assert!(*lsn < end_lsn); @@ -551,14 +597,17 @@ impl InMemoryLayer { } } - /// Write this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta + /// layer will only contain the key range the user specifies, and may return `None` + /// if there are no matching keys. /// /// Returns a new delta layer with all the same data as this in-memory layer pub(crate) async fn write_to_disk( &self, timeline: &Arc, ctx: &RequestContext, - ) -> Result { + key_range: Option>, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -572,6 +621,21 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); + let keys: Vec<_> = if let Some(key_range) = key_range { + inner + .index + .iter() + .filter(|(k, _)| key_range.contains(k)) + .map(|(k, m)| (k.to_i128(), m)) + .collect() + } else { + inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + }; + + if keys.is_empty() { + return Ok(None); + } + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, @@ -585,33 +649,24 @@ impl InMemoryLayer { let cursor = inner.file.block_cursor(); - // Sort the keys because delta layer writer expects them sorted. 
- // - // NOTE: this sort can take up significant time if the layer has millions of - // keys. To speed up all the comparisons we convert the key to i128 and - // keep the value as a reference. - let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect(); - keys.sort_unstable_by_key(|k| k.0); - let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(); - for (key, vec_map) in keys.iter() { - let key = Key::from_i128(*key); + for (key, vec_map) in inner.index.iter() { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) .await; res?; } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; - Ok(delta_layer) + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; + Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 27e60f783c..ebc0cbf9a4 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -14,9 +14,10 @@ use utils::lsn::Lsn; use utils::sync::heavier_once_cell; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; @@ -116,6 +117,12 @@ impl AsLayerDesc for Layer { } } +impl PartialEq for Layer { + fn eq(&self, other: &Self) -> bool { + Arc::as_ptr(&self.0) == 
Arc::as_ptr(&other.0) + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. pub(crate) fn for_evicted( @@ -330,6 +337,12 @@ impl Layer { .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await + .map_err(|err| match err { + GetVectoredError::Other(err) => GetVectoredError::Other( + err.context(format!("get_values_reconstruct_data for layer {self}")), + ), + err => err, + }) } /// Download the layer if evicted. @@ -389,6 +402,10 @@ impl Layer { &self.0.path } + pub(crate) fn debug_str(&self) -> &Arc { + &self.0.debug_str + } + pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } @@ -511,6 +528,9 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, + /// String representation of the layer, used for traversal id. + debug_str: Arc, + desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. @@ -604,9 +624,17 @@ enum Status { impl Drop for LayerInner { fn drop(&mut self) { + // if there was a pending eviction, mark it cancelled here to balance metrics + if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit() + { + // eviction has already been started + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + + // eviction request is intentionally not honored as no one is present to wait for it + // and we could be delaying shutdown for nothing. + } + if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? seems more like a hazard - // than a clear win. 
return; } @@ -708,6 +736,7 @@ impl LayerInner { LayerInner { conf, + debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() }, path, desc, timeline: Arc::downgrade(timeline), @@ -911,11 +940,20 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let res = self.download_init_and_wait(timeline, permit).await?; + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -954,6 +992,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: RequestContext, ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -983,7 +1022,7 @@ impl LayerInner { .await .unwrap(); - let res = this.download_and_init(timeline, permit).await; + let res = this.download_and_init(timeline, permit, &ctx).await; if let Err(res) = tx.send(res) { match res { @@ -1026,6 +1065,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, ) -> anyhow::Result> { let client = timeline .remote_client @@ -1033,7 +1073,12 @@ impl LayerInner { .expect("checked before download_init_and_wait"); let result = client - .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + .download_layer_file( + &self.desc.filename(), + &self.metadata(), + &timeline.cancel, + ctx, + ) .await; match result { @@ -1552,8 +1597,8 @@ impl Drop for DownloadedLayer { if let Some(owner) = self.owner.upgrade() { owner.on_downloaded_layer_drop(self.version); } else { - // no need to do anything, 
we are shutting down - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + // Layer::drop will handle cancelling the eviction; because of drop order and + // `DownloadedLayer` never leaking, we cannot know here if eviction was requested. } } } @@ -1752,6 +1797,28 @@ impl ResidentLayer { } } + /// FIXME: truncate is bad name because we are not truncating anything, but copying the + /// filtered parts. + #[cfg(test)] + pub(super) async fn copy_delta_prefix( + &self, + writer: &mut super::delta_layer::DeltaLayerWriter, + truncate_at: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + use LayerKind::*; + + let owner = &self.owner.0; + + match self.downloaded.get(owner, ctx).await? { + Delta(ref d) => d + .copy_prefix(writer, truncate_at, ctx) + .await + .with_context(|| format!("truncate {self}")), + Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")), + } + } + pub(crate) fn local_path(&self) -> &Utf8Path { &self.owner.0.path } @@ -1761,14 +1828,14 @@ impl ResidentLayer { } #[cfg(test)] - pub(crate) async fn get_inner_delta<'a>( - &'a self, + pub(crate) async fn as_delta( + &self, ctx: &RequestContext, - ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> { - let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { - LayerKind::Delta(d) => Ok(d), - LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")), + ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? 
{ + Delta(ref d) => Ok(d), + Image(_) => Err(anyhow::anyhow!("image layer")), } } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 247ff123b5..52f62faa8d 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -721,11 +721,110 @@ async fn evict_and_wait_does_not_wait_for_download() { layer.evict_and_wait(FOREVER).await.unwrap(); } +/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, +/// which is the last value. +/// +/// Also checks that the same does not happen on a non-evicted layer (regression test). +#[tokio::test(start_paused = true)] +async fn eviction_cancellation_on_drop() { + use crate::repository::Value; + use bytes::Bytes; + + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + { + // create_test_timeline wrote us one layer, write another + let mut writer = timeline.writer().await; + writer + .put( + Key::from_i128(5), + Lsn(0x20), + &Value::Image(Bytes::from_static(b"this does not matter either")), + &ctx, + ) + .await + .unwrap(); + + writer.finish_write(Lsn(0x20)); + } + + timeline.freeze_and_flush().await.unwrap(); + + // wait for the upload to complete so our Arc::strong_count assertion holds + timeline + .remote_client + .as_ref() + .unwrap() + .wait_completion() + .await + .unwrap(); + + let (evicted_layer, not_evicted) = { + let mut layers = { + let mut guard = timeline.layers.write().await; + let layers = guard.likely_resident_layers().collect::>(); + // remove the layers from layermap + 
guard.finish_gc_timeline(&layers); + + layers + }; + + assert_eq!(layers.len(), 2); + + (layers.pop().unwrap(), layers.pop().unwrap()) + }; + + let victims = [(evicted_layer, true), (not_evicted, false)]; + + for (victim, evict) in victims { + let resident = victim.keep_resident().await.unwrap(); + drop(victim); + + assert_eq!(Arc::strong_count(&resident.owner.0), 1); + + if evict { + let evict_and_wait = resident.owner.evict_and_wait(FOREVER); + + // drive the future to await on the status channel, and then drop it + tokio::time::timeout(ADVANCE, evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + } + + // 1 == we only evict one of the layers + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + drop(resident); + + // run any spawned + tokio::time::sleep(ADVANCE).await; + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get() + ); + } +} + +/// A test case to remind you the cost of these structures. You can bump the size limit +/// below if it is really necessary to add more fields to the structures. 
#[test] fn layer_size() { assert_eq!(std::mem::size_of::(), 2040); assert_eq!(std::mem::size_of::(), 104); - assert_eq!(std::mem::size_of::(), 2328); + assert_eq!(std::mem::size_of::(), 2344); // it also has the utf8 path } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 74ed677ffe..41b77c1f4a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -62,7 +62,7 @@ impl BackgroundLoopKind { pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> impl Drop { +) -> tokio::sync::SemaphorePermit<'static> { let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE .with_label_values(&[loop_kind.as_static_str()]) .guard(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d3c8c5f66c..c7a5598cec 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -16,14 +16,17 @@ use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; use pageserver_api::{ - key::AUX_FILES_KEY, - keyspace::KeySpaceAccum, + key::{ + AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, + }, + keyspace::{KeySpaceAccum, SparseKeyPartitioning}, models::{ CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState, }, reltag::BlockNumber, - shard::{ShardIdentity, TenantShardId}, + shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; use serde_with::serde_as; @@ -55,8 +58,6 @@ use std::{ ops::ControlFlow, }; -use crate::deletion_queue::DeletionQueueClient; -use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, @@ -66,6 +67,7 @@ use crate::{ disk_usage_eviction_task::DiskUsageEvictionInfo, 
pgdatadir_mapping::CollectKeySpaceError, }; +use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind}; use crate::{ disk_usage_eviction_task::finite_f32, tenant::storage_layer::{ @@ -77,6 +79,9 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, @@ -119,8 +124,8 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; @@ -137,6 +142,25 @@ pub(super) enum FlushLoopState { Exited, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ImageLayerCreationMode { + /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. + Try, + /// Force creating the image layers if possible. For now, no image layers will be created + /// for metadata keys. Used in compaction code path with force flag enabled. + Force, + /// Initial ingestion of the data, and no data should be dropped in this function. This + /// means that no metadata keys should be included in the partitions. Used in flush frozen layer + /// code path. 
+ Initial, +} + +impl std::fmt::Display for ImageLayerCreationMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct Hole { @@ -182,6 +206,16 @@ pub(crate) struct AuxFilesState { pub(crate) n_deltas: usize, } +/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL +/// ingestion considerably, because WAL ingestion needs to check on most records if the record +/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end +/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the +/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. +pub(crate) struct RelSizeCache { + pub(crate) complete_as_of: Lsn, + pub(crate) map: HashMap, +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -296,7 +330,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: std::sync::RwLock, + pub(crate) gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -307,7 +341,7 @@ pub struct Timeline { pub initdb_lsn: Lsn, /// When did we last calculate the partitioning? - partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>, + partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. 
repartition_threshold: u64, @@ -324,7 +358,7 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub rel_size_cache: RwLock>, + pub(crate) rel_size_cache: RwLock, download_all_remote_layers_task_info: RwLock>, @@ -380,33 +414,59 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// /// Information about how much history needs to be retained, needed by /// Garbage Collection. -/// -pub struct GcInfo { +#[derive(Default)] +pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub retain_lsns: Vec, + pub(crate) retain_lsns: Vec, - /// In addition to 'retain_lsns', keep everything newer than this - /// point. + /// The cutoff coordinates, which are combined by selecting the minimum. + pub(crate) cutoffs: GcCutoffs, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. +#[derive(Debug)] +pub(crate) struct GcCutoffs { + /// Keep everything newer than this point. /// /// This is calculated by subtracting 'gc_horizon' setting from /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - pub horizon_cutoff: Lsn, + pub(crate) horizon: Lsn, /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this /// point. /// /// This is calculated by finding a number such that a record is needed for PITR /// if only if its LSN is larger than 'pitr_cutoff'. - pub pitr_cutoff: Lsn, + pub(crate) pitr: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + horizon: Lsn::INVALID, + pitr: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.horizon, self.pitr) + } } /// An error happened in a get() operation. 
@@ -428,6 +488,51 @@ pub(crate) enum PageReconstructError { /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), + + #[error("{0}")] + MissingKey(MissingKeyError), +} + +#[derive(Debug)] +pub struct MissingKeyError { + key: Key, + shard: ShardNumber, + cont_lsn: Lsn, + request_lsn: Lsn, + ancestor_lsn: Option, + traversal_path: Vec, + backtrace: Option, +} + +impl std::fmt::Display for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; + } + + if !self.traversal_path.is_empty() { + writeln!(f)?; + } + + for (r, c, l) in &self.traversal_path { + writeln!( + f, + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, c, l, + )?; + } + + if let Some(ref backtrace) = self.backtrace { + write!(f, "\n{}", backtrace)?; + } + + Ok(()) + } } impl PageReconstructError { @@ -439,6 +544,7 @@ impl PageReconstructError { AncestorLsnTimeout(_) => false, Cancelled | AncestorStopping(_) => true, WalRedo(_) => false, + MissingKey { .. 
} => false, } } } @@ -482,8 +588,8 @@ pub(crate) enum GetVectoredError { #[error("Requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key {0} not found")] - MissingKey(Key), + #[error("Requested key not found: {0}")] + MissingKey(MissingKeyError), #[error(transparent)] GetReadyAncestorError(GetReadyAncestorError), @@ -586,6 +692,19 @@ impl From for CreateImageLayersError { } } +impl From for PageReconstructError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => PageReconstructError::Cancelled, + GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), + err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), + GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), + GetVectoredError::Other(err) => PageReconstructError::Other(err), + } + } +} + impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; @@ -615,6 +734,23 @@ pub enum GetVectoredImpl { Vectored, } +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -676,16 +812,6 @@ impl Timeline { key: Key, lsn: Lsn, ctx: &RequestContext, - ) -> Result { - self.timeline_get_throttle.throttle(ctx, 1).await; - self.get_impl(key, lsn, ctx).await - } - /// Not subject to [`Self::timeline_get_throttle`]. - async fn get_impl( - &self, - key: Key, - lsn: Lsn, - ctx: &RequestContext, ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); @@ -696,13 +822,7 @@ impl Timeline { // page_service. 
debug_assert!(!self.shard_identity.is_key_disposable(&key)); - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); + self.timeline_get_throttle.throttle(ctx, 1).await; // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image @@ -725,12 +845,84 @@ impl Timeline { None => None, }; - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; + match self.conf.get_impl { + GetImpl::Legacy => { + let reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); + self.get_impl(key, lsn, reconstruct_state, ctx).await + } + GetImpl::Vectored => { + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; + + // Initialise the reconstruct state for the key with the cache + // entry returned above. 
+ let mut reconstruct_state = ValuesReconstructState::new(); + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + traversal_path: Vec::new(), + backtrace: None, + })), + } + } + } + } + + /// Not subject to [`Self::timeline_get_throttle`]. + async fn get_impl( + &self, + key: Key, + lsn: Lsn, + mut reconstruct_state: ValueReconstructState, + ctx: &RequestContext, + ) -> Result { + // XXX: structured stats collection for layer eviction here. 
+ trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(GetKind::Singular) + .start_timer(); let path = self .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; @@ -740,7 +932,7 @@ impl Timeline { let res = self.reconstruct_value(key, lsn, reconstruct_state).await; let elapsed = start.elapsed(); crate::metrics::RECONSTRUCT_TIME - .for_result(&res) + .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); if cfg!(feature = "testing") && res.is_err() { @@ -753,7 +945,7 @@ impl Timeline { writeln!( msg, "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer(), + layer, ) .expect("string grows") }); @@ -782,7 +974,7 @@ impl Timeline { return Err(GetVectoredError::InvalidLsn(lsn)); } - let key_count = keyspace.total_size().try_into().unwrap(); + let key_count = keyspace.total_raw_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } @@ -819,7 +1011,9 @@ impl Timeline { self.get_vectored_sequential_impl(keyspace, lsn, ctx).await } GetVectoredImpl::Vectored => { - let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx) + .await; if self.conf.validate_vectored_get { self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) @@ -854,6 +1048,70 @@ impl Timeline { res } + /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored + /// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found + /// during the search, but for the scan interface, it returns all existing key-value pairs, and does + /// not expect each single key in the key space will be found. 
The semantics is closer to the RocksDB + /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored + /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that + /// the scan operation will not cause OOM in the future. + #[allow(dead_code)] + pub(crate) async fn scan( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + trace!( + "key-value scan request for {:?}@{} from task kind {:?}", + keyspace, + lsn, + ctx.task_kind() + ); + + // We should generalize this into Keyspace::contains in the future. + for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 >= METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + /// Not subject to [`Self::timeline_get_throttle`]. 
pub(super) async fn get_vectored_sequential_impl( &self, @@ -862,18 +1120,47 @@ impl Timeline { ctx: &RequestContext, ) -> Result>, GetVectoredError> { let mut values = BTreeMap::new(); + for range in keyspace.ranges { let mut key = range.start; while key != range.end { - let block = self.get_impl(key, lsn, ctx).await; + let block = self + .get_impl(key, lsn, ValueReconstructState::default(), ctx) + .await; use PageReconstructError::*; match block { Err(Cancelled | AncestorStopping(_)) => { return Err(GetVectoredError::Cancelled) } - Err(Other(err)) if err.to_string().contains("could not find data for key") => { - return Err(GetVectoredError::MissingKey(key)) + Err(MissingKey(_)) + if NON_INHERITED_RANGE.contains(&key) + || NON_INHERITED_SPARSE_RANGE.contains(&key) => + { + // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. + // When we add more types of keys into the page server, we should revisit this part of code and throw errors + // accordingly. + key = key.next(); + } + Err(MissingKey(err)) => { + return Err(GetVectoredError::MissingKey(err)); + } + Err(Other(err)) + if err + .to_string() + .contains("downloading evicted layer file failed") => + { + return Err(GetVectoredError::Other(err)) + } + Err(Other(err)) + if err + .chain() + .any(|cause| cause.to_string().contains("layer loading failed")) => + { + // The intent here is to achieve error parity with the vectored read path. + // When vectored read fails to load a layer it fails the whole read, hence + // we mimic this behaviour here to keep the validation happy. 
+ return Err(GetVectoredError::Other(err)); } _ => { values.insert(key, block); @@ -890,14 +1177,27 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + mut reconstruct_state: ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); + let get_kind = if keyspace.total_raw_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) .await?; + get_data_timer.stop_and_record(); + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { match res { Err(err) => { @@ -911,6 +1211,13 @@ impl Timeline { } } } + reconstruct_timer.stop_and_record(); + + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. 
segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); Ok(results) } @@ -923,6 +1230,11 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) { + if keyspace.overlaps(&Key::metadata_key_range()) { + // skip validation for metadata key range + return; + } + let sequential_res = self .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) .await; @@ -932,7 +1244,7 @@ impl Timeline { match (lhs, rhs) { (Oversized(l), Oversized(r)) => l == r, (InvalidLsn(l), InvalidLsn(r)) => l == r, - (MissingKey(l), MissingKey(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l.key == r.key, (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, (Other(_), Other(_)) => true, _ => false, @@ -946,6 +1258,11 @@ impl Timeline { panic!(concat!("Sequential get failed with {}, but vectored get did not", " - keyspace={:?} lsn={}"), seq_err, keyspace, lsn) }, + (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { + // Sequential get runs after vectored get, so it is possible for the later + // to time out while waiting for its ancestor's Lsn to become ready and for the + // former to succeed (it essentially has a doubled wait time). + }, (Ok(_), Err(vec_err)) => { panic!(concat!("Vectored get failed with {}, but sequential get did not", " - keyspace={:?} lsn={}"), @@ -1026,6 +1343,12 @@ impl Timeline { self.last_record_lsn.load() } + /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no + /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). 
+ pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { + self.last_record_lsn.status_receiver() + } + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } @@ -1257,7 +1580,7 @@ impl Timeline { checkpoint_distance, self.get_last_record_lsn(), self.last_freeze_at.load(), - *self.last_freeze_ts.read().unwrap(), + open_layer.get_opened_at(), ) { match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { @@ -1344,7 +1667,7 @@ impl Timeline { background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - if self.tenant_shard_id.is_zero() { + if self.tenant_shard_id.is_shard_zero() { // Logical size is only maintained accurately on shard zero. self.spawn_initial_logical_size_computation_task(ctx); } @@ -1622,7 +1945,7 @@ impl Timeline { checkpoint_distance: u64, projected_lsn: Lsn, last_freeze_at: Lsn, - last_freeze_ts: Instant, + opened_at: Instant, ) -> bool { let distance = projected_lsn.widening_sub(last_freeze_at); @@ -1648,13 +1971,13 @@ impl Timeline { ); true - } else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since last flush ({:?})", - projected_lsn, - layer_size, - last_freeze_ts.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -1668,6 +1991,15 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { + #[allow(dead_code)] + pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .switch_to_aux_file_v2 + .unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2) + } + pub(crate) fn get_lazy_slru_download(&self) -> bool { let 
tenant_conf = self.tenant_conf.load(); tenant_conf @@ -1869,11 +2201,7 @@ impl Timeline { write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1887,12 +2215,18 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new(( + (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), + Lsn(0), + )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), + rel_size_cache: RwLock::new(RelSizeCache { + complete_as_of: disk_consistent_lsn, + map: HashMap::new(), + }), download_all_remote_layers_task_info: RwLock::new(None), @@ -2237,7 +2571,7 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Logical size is only accurately maintained on shard zero: when called elsewhere, for example // when HTTP API is serving a GET for timeline zero, return zero return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); @@ -2533,7 +2867,7 @@ impl Timeline { crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); // We should never be calculating logical sizes on shard !=0, because these shards do not have // accurate relation sizes, and they do not emit consumption metrics. 
- debug_assert!(self.tenant_shard_id.is_zero()); + debug_assert!(self.tenant_shard_id.is_shard_zero()); let guard = self .gate @@ -2692,7 +3026,7 @@ impl Timeline { } } -type TraversalId = String; +type TraversalId = Arc; trait TraversalLayerExt { fn traversal_id(&self) -> TraversalId; @@ -2700,13 +3034,13 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - self.local_path().to_string() + Arc::clone(self.debug_str()) } } impl TraversalLayerExt for Arc { fn traversal_id(&self) -> TraversalId { - format!("timeline {} in-memory {self}", self.get_timeline_id()) + Arc::clone(self.local_path_str()) } } @@ -2735,7 +3069,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2775,32 +3109,33 @@ impl Timeline { if prev <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
- return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + return Err(PageReconstructError::MissingKey(MissingKeyError { key, - Lsn(cont_lsn.0 - 1), + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(cont_lsn.0 - 1), request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path, + backtrace: None, + })); } } prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { - return Err(layer_traversal_error( - if cfg!(test) { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), - ) - } else { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn - ) - }, + return Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, traversal_path, - )); + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + None + }, + })); } } @@ -2847,12 +3182,8 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || open_layer.traversal_id()), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } } @@ -2878,12 +3209,8 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new(move || 
frozen_layer.traversal_id()), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } } @@ -2891,7 +3218,6 @@ impl Timeline { if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); drop(guard); - // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, lsn_floor); @@ -2904,14 +3230,7 @@ impl Timeline { }; cont_lsn = lsn_floor; *read_count += 1; - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let layer = layer.to_owned(); - move || layer.traversal_id() - }), - )); + traversal_path.push((result, cont_lsn, layer.traversal_id())); continue 'outer; } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent @@ -2964,11 +3283,22 @@ impl Timeline { .await?; keyspace.remove_overlapping_with(&completed); - if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() { + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. + keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + + // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look + // into ancestor timelines). TODO: is there any other metadata which we want to inherit? + if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() { break; } - cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + // Take the min to avoid reconstructing a page with data newer than request Lsn. 
+ cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ctx) .await @@ -2976,14 +3306,24 @@ impl Timeline { timeline = &*timeline_owned; } - if keyspace.total_size() != 0 { - return Err(GetVectoredError::MissingKey(keyspace.start().unwrap())); + if keyspace.total_raw_size() != 0 { + return Err(GetVectoredError::MissingKey(MissingKeyError { + key: keyspace.start().unwrap(), /* better if we can store the full keyspace */ + shard: self + .shard_identity + .get_shard_number(&keyspace.start().unwrap()), + cont_lsn, + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path: vec![], + backtrace: None, + })); } Ok(()) } - /// Collect the reconstruct data for a ketspace from the specified timeline. + /// Collect the reconstruct data for a keyspace from the specified timeline. /// /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect /// the current keyspace. The current keyspace of the search at any given timeline @@ -3018,55 +3358,61 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); + // Do not descent any further if the last layer we visited + // completed all keys in the keyspace it inspected. This is not + // required for correctness, but avoids visiting extra layers + // which turns out to be a perf bottleneck in some cases. 
+ if !unmapped_keyspace.is_empty() { + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); - let in_memory_layer = layers.find_in_memory_layer(|l| { - let start_lsn = l.get_lsn_range().start; - cont_lsn > start_lsn - }); + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); - match in_memory_layer { - Some(l) => { - let lsn_range = l.get_lsn_range().start..cont_lsn; - fringe.update( - ReadableLayer::InMemoryLayer(l), - unmapped_keyspace.clone(), - lsn_range, - ); - } - None => { - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); + match in_memory_layer { + Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; + fringe.update( + ReadableLayer::InMemoryLayer(l), + unmapped_keyspace.clone(), + lsn_range, + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); + } } } - } - // It's safe to drop the layer map lock after planning the next round of reads. - // The fringe keeps readable handles for the layers which are safe to read even - // if layers were compacted or flushed. 
- // - // The more interesting consideration is: "Why is the read algorithm still correct - // if the layer map changes while it is operating?". Doing a vectored read on a - // timeline boils down to pushing an imaginary lsn boundary downwards for each range - // covered by the read. The layer map tells us how to move the lsn downwards for a - // range at *a particular point in time*. It is fine for the answer to be different - // at two different time points. - drop(guard); + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); + } if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { let next_cont_lsn = lsn_range.start; @@ -3081,6 +3427,8 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(); } else { break; } @@ -3404,66 +3752,103 @@ impl Timeline { // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. 
let lsn_range = frozen_layer.get_lsn_range(); - let (layers_to_upload, delta_layer_to_add) = - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - initdb_optimization_count, - .. - } => { + + // Whether to directly create image layers for this flush, or flush them as delta layers + let create_image_layer = + lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1); + + #[cfg(test)] + { + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + .. + } => { + if create_image_layer { *initdb_optimization_count += 1; - } - } - // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not - // require downloading anything during initial import. - let (partitioning, _lsn) = self - .repartition( - self.initdb_lsn, - self.get_compaction_target_size(), - EnumSet::empty(), - ctx, - ) - .await?; - - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - // For image layers, we add them immediately into the layer map. - ( - self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) - .await?, - None, - ) - } else { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - expect_initdb_optimization, - .. - } => { + } else { assert!(!*expect_initdb_optimization, "expected initdb optimization"); } } - // Normal case, write out a L0 delta layer file. - // `create_delta_layer` will not modify the layer map. 
- // We will remove frozen layer and add delta layer in one atomic operation later. - let layer = self.create_delta_layer(&frozen_layer, ctx).await?; - ( - // FIXME: even though we have a single image and single delta layer assumption - // we push them to vec - vec![layer.clone()], - Some(layer), + } + } + + let (layers_to_upload, delta_layer_to_add) = if create_image_layer { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. + let ((rel_partition, metadata_partition), _lsn) = self + .repartition( + self.initdb_lsn, + self.get_compaction_target_size(), + EnumSet::empty(), + ctx, ) + .await?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + + // For metadata, always create delta layers. + let delta_layer = if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single aux file keyspace" + ); + let metadata_keyspace = &metadata_partition.parts[0]; + assert_eq!( + metadata_keyspace.0.ranges.len(), + 1, + "aux file keyspace should be a single range" + ); + self.create_delta_layer( + &frozen_layer, + ctx, + Some(metadata_keyspace.0.ranges[0].clone()), + ) + .await? + } else { + None }; + // For image layers, we add them immediately into the layer map. + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &rel_partition, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + + if let Some(delta_layer) = delta_layer { + layers_to_upload.push(delta_layer.clone()); + (layers_to_upload, Some(delta_layer)) + } else { + (layers_to_upload, None) + } + } else { + // Normal case, write out a L0 delta layer file. + // `create_delta_layer` will not modify the layer map. + // We will remove frozen layer and add delta layer in one atomic operation later. 
+ let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else { + panic!("delta layer cannot be empty if no filter is applied"); + }; + ( + // FIXME: even though we have a single image and single delta layer assumption + // we push them to vec + vec![layer.clone()], + Some(layer), + ) + }; + pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable"); if self.cancel.is_cancelled() { @@ -3524,7 +3909,7 @@ impl Timeline { &self, disk_consistent_lsn: Lsn, layers_to_upload: impl IntoIterator, - ) -> anyhow::Result { + ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -3541,19 +3926,10 @@ impl Timeline { None }; - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); - - let metadata = TimelineMetadata::new( + let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -3565,10 +3941,10 @@ impl Timeline { for layer in layers_to_upload { remote_client.schedule_layer_file_upload(layer)?; } - remote_client.schedule_index_upload_for_metadata_update(&metadata)?; + remote_client.schedule_index_upload_for_metadata_update(&update)?; } - Ok(metadata) + Ok(()) } pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { @@ -3592,12 +3968,18 @@ impl Timeline { self: &Arc, frozen_layer: &Arc, ctx: &RequestContext, - ) -> anyhow::Result { + key_range: Option>, + ) -> anyhow::Result> { let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); let ctx = ctx.attached_child(); let work = async move { - let new_delta = frozen_layer.write_to_disk(&self_clone, &ctx).await?; 
+ let Some(new_delta) = frozen_layer + .write_to_disk(&self_clone, &ctx, key_range) + .await? + else { + return Ok(None); + }; // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. @@ -3616,7 +3998,7 @@ impl Timeline { .sync_all() .await .fatal_err("VirtualFile::sync_all timeline dir"); - anyhow::Ok(new_delta) + anyhow::Ok(Some(new_delta)) }; // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. @@ -3643,19 +4025,20 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<(KeyPartitioning, Lsn)> { + ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. 
anyhow::bail!("repartition() called concurrently, this should not happen"); }; - if lsn < partitioning_guard.1 { + let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + if lsn < *partition_lsn { anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) + let distance = lsn.0 - partition_lsn.0; + if *partition_lsn != Lsn(0) && distance <= self.repartition_threshold && !flags.contains(CompactFlags::ForceRepartition) { @@ -3664,37 +4047,24 @@ impl Timeline { threshold = self.repartition_threshold, "no repartitioning needed" ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + return Ok(( + (dense_partition.clone(), sparse_partition.clone()), + *partition_lsn, + )); } - let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(partition_size); - - *partitioning_guard = (partitioning, lsn); + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let sparse_partitioning = SparseKeyPartitioning { + parts: vec![sparse_ks], + }; // no partitioning for metadata keys for now + *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } // Is it time to create a new image layer for the given partition? async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { - let last = self.last_image_layer_creation_check_at.load(); - if lsn != Lsn(0) { - let distance = lsn - .checked_sub(last) - .expect("Attempt to compact with LSN going backwards"); - - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting below if we've not ingested - // sufficient WAL since the last check. 
- if distance.0 < min_distance { - return false; - } - } - - self.last_image_layer_creation_check_at.store(lsn); - let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; @@ -3744,12 +4114,12 @@ impl Timeline { false } - #[tracing::instrument(skip_all, fields(%lsn, %force))] + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, partitioning: &KeyPartitioning, lsn: Lsn, - force: bool, + mode: ImageLayerCreationMode, ctx: &RequestContext, ) -> Result, CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); @@ -3766,11 +4136,46 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; + let check_for_image_layers = { + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting if this timeline has not ingested sufficient + // WAL since the last check. + distance.0 >= min_distance + }; + + if check_for_image_layers { + self.last_image_layer_creation_check_at.store(lsn); + } + for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; - if !force && !self.time_for_new_image_layer(partition, lsn).await { - start = img_range.end; - continue; + + if partition.overlaps(&Key::metadata_key_range()) { + // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a + // rather big change. Keep this patch small for now. + match mode { + ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => { + // skip image layer creation anyways for metadata keys. 
+ start = img_range.end; + continue; + } + ImageLayerCreationMode::Initial => { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); + } + } + } else if let ImageLayerCreationMode::Try = mode { + // check_for_image_layers = false -> skip + // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate + if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } } let mut image_layer_writer = ImageLayerWriter::new( @@ -3811,7 +4216,7 @@ impl Timeline { key = key.next(); // Maybe flush `key_rest_accum` - if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS || last_key_in_range { let results = self @@ -3850,7 +4255,7 @@ impl Timeline { }; // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img).await?; + image_layer_writer.put_image(img_key, img, ctx).await?; wrote_keys = true; } } @@ -3861,7 +4266,7 @@ impl Timeline { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. start = img_range.end; - let image_layer = image_layer_writer.finish(self).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; image_layers.push(image_layer); } else { // Special case: the image layer may be empty if this is a sharded tenant and the @@ -4054,7 +4459,7 @@ impl Timeline { Ok(()) } - /// Update information about which layer files need to be retained on + /// Find the Lsns above which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete /// page versions more aggressively. 
@@ -4062,17 +4467,6 @@ impl Timeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. (As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart @@ -4080,23 +4474,22 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - /// - /// NOTE: This function holds a short-lived lock to protect the 'gc_info' - /// field, so that the three values passed as argument are stored - /// atomically. But the caller is responsible for ensuring that no new - /// branches are created that would need to be included in 'retain_lsns', - /// for example. The caller should hold `Tenant::gc_cs` lock to ensure - /// that. 
- /// #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(super) async fn update_gc_info( + pub(super) async fn find_gc_cutoffs( &self, - retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { + let _timer = self + .metrics + .find_gc_cutoffs_histo + .start_timer() + .record_on_drop(); + + pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // // Some unit tests depend on garbage-collection working even when @@ -4142,19 +4535,14 @@ impl Timeline { *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention was configured. Set time-based cutoff to - // same as LSN based. - cutoff_horizon + // No time-based retention was configured. Interpret this as "keep no history". + self.get_last_record_lsn() }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(GcCutoffs { + horizon: cutoff_horizon, + pitr: pitr_cutoff, + }) } /// Garbage collect layer files on a timeline that are no longer needed. @@ -4183,8 +4571,8 @@ impl Timeline { let (horizon_cutoff, pitr_cutoff, retain_lsns) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; + let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); (horizon_cutoff, pitr_cutoff, retain_lsns) }; @@ -4664,35 +5052,7 @@ impl Timeline { } } -type TraversalPathItem = ( - ValueReconstructResult, - Lsn, - Box TraversalId>, -); - -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. 
-fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructError { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .into_iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l(), - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); - - // Append all subsequent traversals, and the error message 'msg', as contexts. - let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - PageReconstructError::from(msg) -} +type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); struct TimelineWriterState { open_layer: Arc, @@ -4703,23 +5063,16 @@ struct TimelineWriterState { max_lsn: Option, // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. cached_last_freeze_at: Lsn, - cached_last_freeze_ts: Instant, } impl TimelineWriterState { - fn new( - open_layer: Arc, - current_size: u64, - last_freeze_at: Lsn, - last_freeze_ts: Instant, - ) -> Self { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { Self { open_layer, current_size, prev_lsn: None, max_lsn: None, cached_last_freeze_at: last_freeze_at, - cached_last_freeze_ts: last_freeze_ts, } } } @@ -4818,12 +5171,10 @@ impl<'a> TimelineWriter<'a> { let initial_size = layer.size().await?; let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); self.write_guard.replace(TimelineWriterState::new( layer, initial_size, last_freeze_at, - last_freeze_ts, )); Ok(()) @@ -4870,7 +5221,7 @@ impl<'a> TimelineWriter<'a> { self.get_checkpoint_distance(), lsn, state.cached_last_freeze_at, - state.cached_last_freeze_ts, + state.open_layer.get_opened_at(), ) { OpenLayerAction::Roll } else { diff --git 
a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8075775bbc..1088101a13 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,13 +9,13 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use super::layer_manager::LayerManager; -use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline}; +use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline}; use anyhow::{anyhow, Context}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; use utils::id::TimelineId; @@ -102,7 +102,7 @@ impl Timeline { ) .await { - Ok((partitioning, lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::extend(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -115,17 +115,37 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". - let layers = self + let dense_layers = self .create_image_layers( - &partitioning, + &dense_partitioning, lsn, - flags.contains(CompactFlags::ForceImageLayerCreation), + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, &image_ctx, ) .await .map_err(anyhow::Error::from)?; - self.upload_new_image_layers(layers)?; + // For now, nothing will be produced... 
+ let sparse_layers = self + .create_image_layers( + &sparse_partitioning.clone().into_dense(), + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await + .map_err(anyhow::Error::from)?; + assert!(sparse_layers.is_empty()); + + self.upload_new_image_layers(dense_layers)?; } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -500,7 +520,7 @@ impl Timeline { writer .take() .unwrap() - .finish(prev_key.unwrap().next(), self) + .finish(prev_key.unwrap().next(), self, ctx) .await?, ); writer = None; @@ -542,7 +562,11 @@ impl Timeline { ); } - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + writer + .as_mut() + .unwrap() + .put_value(key, lsn, value, ctx) + .await?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -558,7 +582,7 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); } // Sync layers @@ -758,8 +782,9 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let keyspace = self.collect_keyspace(end_lsn, ctx).await?; - let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace)); + let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + // TODO(chi): ignore sparse_keyspace for now, compact it in the future. 
+ let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); pageserver_compaction::compact_tiered::compact_tiered( &mut adaptor, @@ -831,6 +856,10 @@ impl CompactionJobExecutor for TimelineAdaptor { type RequestContext = crate::context::RequestContext; + fn get_shard_identity(&self) -> &ShardIdentity { + self.timeline.get_shard_identity() + } + async fn get_layers( &mut self, key_range: &Range, @@ -947,7 +976,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let value = val.load(ctx).await?; - writer.put_value(key, lsn, value).await?; + writer.put_value(key, lsn, value, ctx).await?; prev = Some((key, lsn)); } @@ -963,7 +992,7 @@ impl CompactionJobExecutor for TimelineAdaptor { }); let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline) + .finish(prev.unwrap().0.next(), &self.timeline, ctx) .await?; self.new_deltas.push(new_delta_layer); @@ -1033,11 +1062,11 @@ impl TimelineAdaptor { } } }; - image_layer_writer.put_image(key, img).await?; + image_layer_writer.put_image(key, img, ctx).await?; key = key.next(); } } - let image_layer = image_layer_writer.finish(&self.timeline).await?; + let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; self.new_images.push(image_layer); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 522c5b57de..3567761b9a 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -188,24 +188,10 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + let permit = self.acquire_imitation_permit(cancel, ctx).await?; - let _permit = tokio::select! 
{ - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - match self - .imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await - { - ControlFlow::Break(()) => return ControlFlow::Break(()), - ControlFlow::Continue(()) => (), - } + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await?; #[derive(Debug, Default)] struct EvictionStats { @@ -330,19 +316,27 @@ impl Timeline { gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await + } + + async fn acquire_imitation_permit( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( BackgroundLoopKind::Eviction, ctx, ); - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; - - self.imitate_layer_accesses(tenant, p, cancel, gate, ctx) - .await + tokio::select! 
{ + permit = acquire_permit => ControlFlow::Continue(permit), + _ = cancel.cancelled() => ControlFlow::Break(()), + _ = self.cancel.cancelled() => ControlFlow::Break(()), + } } /// If we evict layers but keep cached values derived from those layers, then @@ -376,9 +370,10 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, + permit: tokio::sync::SemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { - if !self.tenant_shard_id.is_zero() { + if !self.tenant_shard_id.is_shard_zero() { // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore // skip imitating logical size accesses for eviction purposes. @@ -408,7 +403,28 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let mut state = tenant.eviction_task_tenant_state.lock().await; + let (mut state, _permit) = { + if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() { + (locked, permit) + } else { + // we might need to wait for a long time here in case of pathological synthetic + // size calculation performance + drop(permit); + let locked = tokio::select! { + locked = tenant.eviction_task_tenant_state.lock() => locked, + _ = self.cancel.cancelled() => { + return ControlFlow::Break(()) + }, + _ = cancel.cancelled() => { + return ControlFlow::Break(()) + } + }; + // then reacquire -- this will be bad if there is a lot of traffic, but because we + // released the permit, the overall latency will be much better. 
+ let permit = self.acquire_imitation_permit(cancel, ctx).await?; + (locked, permit) + } + }; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index dae31934ad..991e4ac045 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -22,10 +22,12 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; + use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, Code, Streaming}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -89,6 +91,14 @@ pub(super) async fn connection_manager_loop_step( .timeline .subscribe_for_state_updates(); + let mut wait_lsn_status = connection_manager_state + .timeline + .subscribe_for_wait_lsn_updates(); + + // TODO: create a separate config option for discovery request interval + let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout; + let mut last_discovery_ts: Option = None; + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. 
@@ -97,10 +107,12 @@ pub(super) async fn connection_manager_loop_step( loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); + let any_activity = connection_manager_state.wal_connection.is_some() + || !connection_manager_state.wal_stream_candidates.is_empty(); // These things are happening concurrently: // - // - cancellation request + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -108,6 +120,7 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently + // - if there's no connection and no candidates, try to send a discovery request // NB: make sure each of the select expressions are cancellation-safe // (no need for arms to be cancellation-safe). @@ -214,6 +227,65 @@ pub(super) async fn connection_manager_loop_step( } } } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), + + Some(()) = async { + // Reminder: this match arm needs to be cancellation-safe. + // Calculating time needed to wait until sending the next discovery request. + // Current implementation is conservative and sends discovery requests only when there are no candidates. + + if any_activity { + // No need to send discovery requests if there is an active connection or candidates. + return None; + } + + // Waiting for an active wait_lsn request. + while wait_lsn_status.borrow().is_none() { + if wait_lsn_status.changed().await.is_err() { + // wait_lsn_status channel was closed, exiting + warn!("wait_lsn_status channel was closed in connection_manager_loop_step"); + return None; + } + } + + // All preconditions met, preparing to send a discovery request. 
+ let now = std::time::Instant::now(); + let next_discovery_ts = last_discovery_ts + .map(|ts| ts + discovery_request_interval) + .unwrap_or_else(|| now); + + if next_discovery_ts > now { + // Prevent sending discovery requests too frequently. + tokio::time::sleep(next_discovery_ts - now).await; + } + + let tenant_timeline_id = Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SafekeeperDiscoveryRequest { tenant_timeline_id }; + let msg = TypedMessage { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: Some(request), + safekeeper_discovery_response: None, + }; + + last_discovery_ts = Some(std::time::Instant::now()); + debug!("No active connection and no candidates, sending discovery request to the broker"); + + // Cancellation safety: we want to send a message to the broker, but publish_one() + // function can get cancelled by the other select! arm. This is absolutely fine, because + // we just want to receive broker updates and discovery is not important if we already + // receive updates. + // + // It is possible that `last_discovery_ts` will be updated, but the message will not be sent. + // This is totally fine because of the reason above. 
+ + // This is a fire-and-forget request, we don't care about the response + let _ = broker_client.publish_one(msg).await; + debug!("Discovery request sent to the broker"); + None + } => {} } if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { @@ -231,7 +303,7 @@ async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, cancel: &CancellationToken, -) -> Result, Cancelled> { +) -> Result, Cancelled> { let mut attempt = 0; loop { exponential_backoff( @@ -244,17 +316,27 @@ async fn subscribe_for_timeline_updates( attempt += 1; // subscribe to the specific timeline - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { - tenant_id: id.tenant_id.as_ref().to_owned(), - timeline_id: id.timeline_id.as_ref().to_owned(), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), + let request = SubscribeByFilterRequest { + types: vec![ + TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo as i32, + }, + TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + }, + ], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }), + }), }; match { tokio::select! { - r = broker_client.subscribe_safekeeper_info(request) => { r } + r = broker_client.subscribe_by_filter(request) => { r } _ = cancel.cancelled() => { return Err(Cancelled); } } } { @@ -398,7 +480,7 @@ struct RetryInfo { /// Data about the timeline to connect to, received from the broker. #[derive(Debug, Clone)] struct BrokerSkTimeline { - timeline: SafekeeperTimelineInfo, + timeline: SafekeeperDiscoveryResponse, /// Time at which the data was fetched from the broker last time, to track the stale data. 
latest_update: NaiveDateTime, } @@ -606,7 +688,41 @@ impl ConnectionManagerState { } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. - fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + fn register_timeline_update(&mut self, typed_msg: TypedMessage) { + let mut is_discovery = false; + let timeline_update = match typed_msg.r#type() { + MessageType::SafekeeperTimelineInfo => { + let info = match typed_msg.safekeeper_timeline_info { + Some(info) => info, + None => { + warn!("bad proto message from broker: no safekeeper_timeline_info"); + return; + } + }; + SafekeeperDiscoveryResponse { + safekeeper_id: info.safekeeper_id, + tenant_timeline_id: info.tenant_timeline_id, + commit_lsn: info.commit_lsn, + safekeeper_connstr: info.safekeeper_connstr, + availability_zone: info.availability_zone, + } + } + MessageType::SafekeeperDiscoveryResponse => { + is_discovery = true; + match typed_msg.safekeeper_discovery_response { + Some(response) => response, + None => { + warn!("bad proto message from broker: no safekeeper_discovery_response"); + return; + } + } + } + _ => { + // unexpected message + return; + } + }; + WALRECEIVER_BROKER_UPDATES.inc(); let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); @@ -619,7 +735,11 @@ impl ConnectionManagerState { ); if old_entry.is_none() { - info!("New SK node was added: {new_safekeeper_id}"); + info!( + ?is_discovery, + %new_safekeeper_id, + "New SK node was added", + ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -818,7 +938,7 @@ impl ConnectionManagerState { fn select_connection_candidate( &self, node_to_omit: Option, - ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -828,7 +948,7 @@ impl 
ConnectionManagerState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates @@ -968,19 +1088,11 @@ mod tests { latest_update: NaiveDateTime, ) -> BrokerSkTimeline { BrokerSkTimeline { - timeline: SafekeeperTimelineInfo { + timeline: SafekeeperDiscoveryResponse { safekeeper_id: 0, tenant_timeline_id: None, - term: 0, - last_log_term: 0, - flush_lsn: 0, commit_lsn, - backup_lsn: 0, - remote_consistent_lsn: 0, - peer_horizon_lsn: 0, - local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), - http_connstr: safekeeper_connstr.to_owned(), availability_zone: None, }, latest_update, @@ -1423,7 +1535,7 @@ mod tests { let harness = TenantHarness::create("switch_to_same_availability_zone")?; let mut state = dummy_state(&harness).await; - state.conf.availability_zone = test_az.clone(); + state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1456,7 +1568,7 @@ mod tests { // We have another safekeeper with the same commit_lsn, and it have the same availability zone as // the current pageserver. let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); - same_az_sk.timeline.availability_zone = test_az.clone(); + same_az_sk.timeline.availability_zone.clone_from(&test_az); state.wal_stream_candidates = HashMap::from([ ( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f3419e886..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. 
// Regular standby_status_update fields are put into this message. - let current_timeline_size = if timeline.tenant_shard_id.is_zero() { + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { timeline .get_current_logical_size( crate::tenant::timeline::GetLogicalSizePriority::User, diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 3a6950cf88..91934d5e0e 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -61,18 +61,18 @@ pub struct VectoredRead { } impl VectoredRead { - pub fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } } #[derive(Eq, PartialEq)] -enum VectoredReadExtended { +pub(crate) enum VectoredReadExtended { Yes, No, } -struct VectoredReadBuilder { +pub(crate) struct VectoredReadBuilder { start: u64, end: u64, blobs_at: VecMap, @@ -80,7 +80,17 @@ struct VectoredReadBuilder { } impl VectoredReadBuilder { - fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. 
+ pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) @@ -97,7 +107,8 @@ impl VectoredReadBuilder { /// Attempt to extend the current read with a new blob if the start /// offset matches with the current end of the vectored read /// and the resuting size is below the max read size - fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; if self.end == start && self.size() + size <= self.max_read_size { self.end = end; @@ -111,11 +122,11 @@ impl VectoredReadBuilder { VectoredReadExtended::No } - fn size(&self) -> usize { + pub(crate) fn size(&self) -> usize { (self.end - self.start) as usize } - fn build(self) -> VectoredRead { + pub(crate) fn build(self) -> VectoredRead { VectoredRead { start: self.start, end: self.end, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 0cf6a0019b..a17488a286 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,6 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! +use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; use crate::page_cache::PageWriteGuard; @@ -32,11 +33,11 @@ pub use io_engine::feature_test as io_engine_feature_test; pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; +use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; -#[cfg_attr(not(target_os = "linux"), allow(dead_code))] pub(crate) mod owned_buffers_io { //! 
Abstractions for IO with owned buffers. //! @@ -615,6 +616,7 @@ impl VirtualFile { &self, buf: B, mut offset: u64, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { let buf_len = buf.bytes_init(); if buf_len == 0 { @@ -623,7 +625,7 @@ impl VirtualFile { let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { let res; - (buf, res) = self.write_at(buf, offset).await; + (buf, res) = self.write_at(buf, offset, ctx).await; match res { Ok(0) => { return ( @@ -652,6 +654,7 @@ impl VirtualFile { pub async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -660,7 +663,7 @@ impl VirtualFile { let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { let res; - (buf, res) = self.write(buf).await; + (buf, res) = self.write(buf, ctx).await; match res { Ok(0) => { return ( @@ -684,9 +687,10 @@ impl VirtualFile { async fn write( &mut self, buf: Slice, + ctx: &RequestContext, ) -> (Slice, Result) { let pos = self.pos; - let (buf, res) = self.write_at(buf, pos).await; + let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { Ok(n) => n, Err(e) => return (buf, Err(e)), @@ -724,6 +728,7 @@ impl VirtualFile { &self, buf: Slice, offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ ) -> (Slice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -1083,6 +1088,18 @@ impl Drop for VirtualFile { } } +impl OwnedAsyncWriter for VirtualFile { + #[inline(always)] + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; + res.map(move |v| (v, buf)) + } +} + impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); @@ -1135,6 +1152,9 @@ fn get_open_files() -> &'static 
OpenFiles { #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; use rand::seq::SliceRandom; use rand::thread_rng; @@ -1166,10 +1186,11 @@ mod tests { &self, buf: B, offset: u64, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset).await; + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } MaybeVirtualFile::File(file) => { @@ -1190,10 +1211,11 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } MaybeVirtualFile::File(file) => { @@ -1264,6 +1286,7 @@ mod tests { OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1277,7 +1300,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar".to_vec()).await?; + file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1286,7 +1309,7 @@ mod tests { let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string().await?); @@ -1328,8 +1351,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3).await?; - file_b.write_all_at(b"FOO".to_vec(), 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; + 
file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 7505b7487e..55b1d0b46b 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,33 +1,46 @@ -use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile}; +use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; use tokio_epoll_uring::{BoundedBuf, IoBuf}; -pub struct Writer { - dst: VirtualFile, +pub struct Writer { + dst: W, bytes_amount: u64, } -impl Writer { - pub fn new(dst: VirtualFile) -> Self { +impl Writer { + pub fn new(dst: W) -> Self { Self { dst, bytes_amount: 0, } } + + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + + pub fn as_inner(&self) -> &W { + &self.dst + } + /// Returns the wrapped `VirtualFile` object as well as the number /// of bytes that were written to it through this object. 
- pub fn into_inner(self) -> (u64, VirtualFile) { + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub fn into_inner(self) -> (u64, W) { (self.bytes_amount, self.dst) } } -impl OwnedAsyncWriter for Writer { +impl OwnedAsyncWriter for Writer +where + W: OwnedAsyncWriter, +{ #[inline(always)] async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = self.dst.write_all(buf).await; - let nwritten = res?; + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f1812d9b51..885a9221c5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,23 +1,26 @@ use bytes::BytesMut; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use crate::context::RequestContext; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)>; } -/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers -/// into `BUFFER_SIZE`-sized writes. +/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch +/// small writes into larger writes of size [`Buffer::cap`]. /// /// # Passthrough Of Large Writers /// -/// Buffered writes larger than the `BUFFER_SIZE` cause the internal -/// buffer to be flushed, even if it is not full yet. Then, the large -/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`]. 
+/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] +/// cause the internal buffer to be flushed prematurely so that the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. /// /// This pass-through is generally beneficial for throughput, but if /// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, @@ -25,93 +28,194 @@ pub trait OwnedAsyncWriter { /// /// In such cases, a different implementation that always buffers in memory /// may be preferable. -pub struct BufferedWriter { +pub struct BufferedWriter { writer: W, - // invariant: always remains Some(buf) - // with buf.capacity() == BUFFER_SIZE except - // - while IO is ongoing => goes back to Some() once the IO completed successfully - // - after an IO error => stays `None` forever - // In these exceptional cases, it's `None`. - buf: Option, + /// invariant: always remains Some(buf) except + /// - while IO is ongoing => goes back to Some() once the IO completed successfully + /// - after an IO error => stays `None` forever + /// In these exceptional cases, it's `None`. 
+ buf: Option, } -impl BufferedWriter +impl BufferedWriter where + B: Buffer + Send, + Buf: IoBuf + Send, W: OwnedAsyncWriter, { - pub fn new(writer: W) -> Self { + pub fn new(writer: W, buf: B) -> Self { Self { writer, - buf: Some(BytesMut::with_capacity(BUFFER_SIZE)), + buf: Some(buf), } } - pub async fn flush_and_into_inner(mut self) -> std::io::Result { - self.flush().await?; + pub fn as_inner(&self) -> &W { + &self.writer + } + + /// Panics if used after any of the write paths returned an error + pub fn inspect_buffer(&self) -> &B { + self.buf() + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + self.flush(ctx).await?; + let Self { buf, writer } = self; assert!(buf.is_some()); Ok(writer) } - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<()> - where - B: IoBuf + Send, - { + #[inline(always)] + fn buf(&self) -> &B { + self.buf + .as_ref() + .expect("must not use after we returned an error") + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn write_buffered( + &mut self, + chunk: Slice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, S)> { + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk - if chunk.len() >= BUFFER_SIZE { - self.flush().await?; + if chunk.len() >= self.buf().cap() { + self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( self.buf .as_ref() .expect("must not use after an error") - .len(), + .pending(), 0 ); - let chunk_len = chunk.len(); - let (nwritten, chunk) = self.writer.write_all(chunk).await?; + let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; assert_eq!(nwritten, chunk_len); - drop(chunk); - return Ok(()); + return Ok((nwritten, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < BUFFER_SIZE); - let mut chunk = &chunk[..]; + assert!(chunk.len() < self.buf().cap()); + let mut slice = &chunk[..]; 
+ while !slice.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = slice.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&slice[..n]); + slice = &slice[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; + } + } + assert!(slice.is_empty(), "by now we should have drained the chunk"); + Ok((chunk_len, chunk.into_inner())) + } + + /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. + /// + /// It is less performant because we always have to copy the borrowed data into the internal buffer + /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant + /// for large writes. + pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + let chunk_len = chunk.len(); while !chunk.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); - let need = BUFFER_SIZE - buf.len(); + let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); buf.extend_from_slice(&chunk[..n]); chunk = &chunk[n..]; - if buf.len() >= BUFFER_SIZE { - assert_eq!(buf.len(), BUFFER_SIZE); - self.flush().await?; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; } } - assert!(chunk.is_empty(), "by now we should have drained the chunk"); - Ok(()) + Ok(chunk_len) } - async fn flush(&mut self) -> std::io::Result<()> { + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); - if buf.is_empty() { + let buf_len = buf.pending(); + if buf_len == 0 { self.buf = Some(buf); - return std::io::Result::Ok(()); + return Ok(()); } - let buf_len = buf.len(); - let (nwritten, mut buf) = self.writer.write_all(buf).await?; + let 
(nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; assert_eq!(nwritten, buf_len); - buf.clear(); - self.buf = Some(buf); + self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) } } +/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones. +pub trait Buffer { + type IoBuf: IoBuf; + + /// Capacity of the buffer. Must not change over the lifetime `self`.` + fn cap(&self) -> usize; + + /// Add data to the buffer. + /// Panics if there is not enough room to accomodate `other`'s content, i.e., + /// panics if `other.len() > self.cap() - self.pending()`. + fn extend_from_slice(&mut self, other: &[u8]); + + /// Number of bytes in the buffer. + fn pending(&self) -> usize; + + /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// so we can use [`tokio_epoll_uring`] to write it to disk. + fn flush(self) -> Slice; + + /// After the write to disk is done and we have gotten back the slice, + /// [`BufferedWriter`] uses this method to re-use the io buffer. 
+ fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; +} + +impl Buffer for BytesMut { + type IoBuf = BytesMut; + + #[inline(always)] + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + BytesMut::extend_from_slice(self, other) + } + + #[inline(always)] + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> Slice { + if self.is_empty() { + return self.slice_full(); + } + let len = self.len(); + self.slice(0..len) + } + + fn reuse_after_flush(mut iobuf: BytesMut) -> Self { + iobuf.clear(); + iobuf + } +} + impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -125,7 +229,11 @@ impl OwnedAsyncWriter for Vec { #[cfg(test)] mod tests { + use bytes::BytesMut; + use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; #[derive(Default)] struct RecorderWriter { @@ -135,6 +243,7 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -147,10 +256,14 @@ mod tests { } } + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + } + macro_rules! 
write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) .await?; }}; } @@ -158,13 +271,13 @@ mod tests { #[tokio::test] async fn test_buffered_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"b"); write!(writer, b"c"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] @@ -175,12 +288,12 @@ mod tests { #[tokio::test] async fn test_passthrough_writes_only() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"abc"); write!(writer, b"de"); write!(writer, b""); write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] @@ -191,16 +304,45 @@ mod tests { #[tokio::test] async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::<2, _>::new(recorder); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"a"), 
Vec::from(b"bc"), Vec::from(b"de")] ); Ok(()) } + + #[tokio::test] + async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + let ctx = &ctx; + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; + + let recorder = writer.flush_and_into_inner(ctx).await?; + assert_eq!( + recorder.writes, + { + let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; + expect + } + .iter() + .map(|v| v[..].to_vec()) + .collect::>() + ); + Ok(()) + } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9c7e8748d5..79f075b877 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -403,7 +403,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. 
self.observe_decoded_block(modification, blk, ctx).await?; @@ -1034,7 +1034,7 @@ impl WalIngest { let nblocks = modification .tline - .get_rel_size(src_rel, Version::Modified(modification), true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1068,13 +1068,7 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn( - src_rel, - blknum, - Version::Modified(modification), - true, - ctx, - ) + .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1242,7 +1236,7 @@ impl WalIngest { }; if modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1541,7 +1535,7 @@ impl WalIngest { nblocks } else if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1553,7 +1547,7 @@ impl WalIngest { } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; @@ -1650,14 +1644,14 @@ async fn get_relsize( ) -> anyhow::Result { let nblocks = if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { 0 } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; Ok(nblocks) @@ -1732,29 +1726,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1762,46 +1756,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) .await?, test_img("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, 
Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1817,19 +1811,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img("foo blk 1 at 4") ); @@ -1837,13 +1831,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img("foo blk 2 at 5") ); @@ -1856,7 +1850,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1869,19 +1863,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - 
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) .await?, test_img("foo blk 1") ); @@ -1894,21 +1888,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img("foo blk 1500") ); @@ -1935,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); @@ -1954,7 +1948,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) .await?, false ); @@ -1972,13 +1966,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2011,24 +2005,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2039,7 +2033,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) .await?, test_img(&data) ); @@ -2056,7 +2050,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2066,7 +2060,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) .await?, test_img(&data) ); @@ -2075,7 +2069,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2084,7 +2078,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) .await?, test_img(&data) ); @@ -2104,13 +2098,13 @@ mod tests { 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2120,7 +2114,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) .await?, test_img(&data) ); @@ -2154,7 +2148,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2168,7 +2162,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2183,7 +2177,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2201,7 +2195,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ae2d996879..02f6f49694 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -55,6 +55,7 @@ impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? 
pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ca41a576fd..9776d4ce88 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -20,6 +20,7 @@ /// Process lifecycle and abstracction for the IPC protocol. mod process; +pub use process::Kind as ProcessKind; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; @@ -34,7 +35,7 @@ use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; -use pageserver_api::models::WalRedoManagerStatus; +use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; use std::sync::Arc; use std::time::Duration; @@ -54,7 +55,7 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. + /// The current [`process::Process`] that is used by new redo requests. /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. @@ -66,7 +67,7 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. 
- redo_process: heavier_once_cell::OnceCell>, + redo_process: heavier_once_cell::OnceCell>, } /// @@ -139,8 +140,8 @@ impl PostgresRedoManager { } } - pub(crate) fn status(&self) -> Option { - Some(WalRedoManagerStatus { + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { last_redo_at: { let at = *self.last_redo_at.lock().unwrap(); at.and_then(|at| { @@ -149,8 +150,14 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - pid: self.redo_process.get().map(|p| p.id()), - }) + process: self + .redo_process + .get() + .map(|p| WalRedoManagerProcessStatus { + pid: p.id(), + kind: std::borrow::Cow::Borrowed(p.kind().into()), + }), + } } } @@ -208,37 +215,33 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::Process::launch(self.conf, self.tenant_shard_id, pg_version) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo 
process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc + } + }; let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres let result = proc .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index bcbb263663..ad6b4e5fe9 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -1,186 +1,67 @@ -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - walrecord::NeonWalRecord, -}; -use anyhow::Context; +use std::time::Duration; + use bytes::Bytes; -use nix::poll::{PollFd, PollFlags}; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; -use postgres_ffi::BLCKSZ; -use std::os::fd::AsRawFd; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - io::{Read, Write}, - process::{ChildStdin, ChildStdout, Command, Stdio}, - sync::{Mutex, MutexGuard}, - time::Duration, -}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, nonblock::set_nonblock}; +use utils::lsn::Lsn; + +use crate::{config::PageServerConf, walrecord::NeonWalRecord}; mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. mod protocol; -pub struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. 
- #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, +mod process_impl { + pub(super) mod process_async; + pub(super) mod process_std; } -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, +#[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum Kind { + Sync, + Async, } -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, +pub(crate) enum Process { + Sync(process_impl::process_std::WalRedoProcess), + Async(process_impl::process_async::WalRedoProcess), } -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(pg_version=pg_version))] - pub(crate) fn launch( +impl Process { + #[inline(always)] + pub fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { - crate::span::debug_assert_current_span_has_tenant_id(); - - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. 
- let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - use no_leak_child::NoLeakChildCommandExt; - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - // the first arg must be --wal-redo so the child process enters into walredo mode - .arg("--wal-redo") - // the child doesn't process this arg, but, having it in the argv helps indentify the - // walredo process for a particular tenant when debugging a pagserver - .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // NB: The redo process is not trusted after we sent it the first - // walredo work. Before that, it is trusted. Specifically, we trust - // it to - // 1. close all file descriptors except stdin, stdout, stderr because - // pageserver might not be 100% diligent in setting FD_CLOEXEC on all - // the files it opens, and - // 2. to use seccomp to sandbox itself before processing the first - // walredo request. - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! 
set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. 
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), + Ok(match conf.walredo_process_kind { + Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), + Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch( + conf, + tenant_shard_id, + pg_version, + )?), }) } - pub(crate) fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - pub(crate) fn apply_wal_records( + #[inline(always)] + pub(crate) async fn apply_wal_records( &self, rel: RelTag, blknum: u32, @@ -188,221 +69,29 @@ impl WalRedoProcess { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> anyhow::Result { - let tag = protocol::BufferTag { rel, blknum }; - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. 
- // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. - let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - protocol::build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + match self { + Process::Sync(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - } - protocol::build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. 
- let mut nwrite = 0usize; - - while nwrite < writebuf.len() { - let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); + Process::Async(p) => { + p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout) + .await } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. 
- - let mut output = self.stdout.lock().unwrap(); - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. 
- // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - .take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - use std::sync::atomic::Ordering; - - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will 
be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); } } - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} + pub(crate) fn id(&self) -> u32 { + match self { + Process::Sync(p) => p.id(), + Process::Async(p) => p.id(), + } + } -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only + pub(crate) fn kind(&self) -> Kind { + match self { + Process::Sync(_) => Kind::Sync, + Process::Async(_) => Kind::Async, + } } } diff --git a/pageserver/src/walredo/process/process_impl/process_async.rs b/pageserver/src/walredo/process/process_impl/process_async.rs new file mode 100644 index 0000000000..262858b033 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_async.rs @@ -0,0 +1,374 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, 
AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. 
Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. 
+ // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. 
Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); 
+ + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs new file mode 100644 index 0000000000..e7a6c263c9 --- /dev/null +++ b/pageserver/src/walredo/process/process_impl/process_std.rs @@ -0,0 +1,405 @@ +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + walrecord::NeonWalRecord, + walredo::process::{no_leak_child, protocol}, +}; +use anyhow::Context; +use bytes::Bytes; +use nix::poll::{PollFd, PollFlags}; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +use std::os::fd::AsRawFd; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + io::{Read, Write}, + process::{ChildStdin, ChildStdout, Command, Stdio}, + sync::{Mutex, MutexGuard}, + time::Duration, +}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, 
nonblock::set_nonblock}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: Mutex, + stdin: Mutex, + /// Counter to separate same sized walredo inputs failing at the same millisecond. + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. 
close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. + .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + tenant_shard_id, + child: Some(child), + stdin: Mutex::new(ProcessInput { + stdin, + n_requests: 0, + }), + stdout: Mutex::new(ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + // Apply given WAL records ('records') over an old page image. Returns + // new page image. + // + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let tag = protocol::BufferTag { rel, blknum }; + let input = self.stdin.lock().unwrap(); + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + fn apply_wal_records0( + &self, + writebuf: &[u8], + input: MutexGuard, + wal_redo_timeout: Duration, + ) -> anyhow::Result { + let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. + let mut nwrite = 0usize; + + while nwrite < writebuf.len() { + let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)]; + let n = loop { + match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If 'stdin' is writeable, do write. + let in_revents = stdin_pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } + if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
+ anyhow::bail!("WAL redo process closed its stdin unexpectedly"); + } + } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(proc); + + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output = self.stdout.lock().unwrap(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)]; + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll( + &mut stdout_pollfds[..], + wal_redo_timeout.as_millis() as i32, + ) { + Err(nix::errno::Errno::EINTR) => continue, + res => break res, + } + }?; + + if n == 0 { + anyhow::bail!("WAL redo timed out"); + } + + // If we have some data in stdout, read it to the result buffer. 
+ let out_revents = stdout_pollfds[0].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } + if out_revents.contains(PollFlags::POLLHUP) { + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + 
.take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); + + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 2276b4e807..b7b1e7ccbf 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -49,6 +49,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; +int neon_protocol_version = 1; + static int n_reconnect_attempts = 0; static int max_reconnect_attempts = 60; static int stripe_size; @@ -379,7 +381,17 @@ pageserver_connect(shardno_t 
shard_no, int elevel) pfree(msg); return false; } - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + switch (neon_protocol_version) + { + case 2: + query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + break; + case 1: + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + break; + default: + elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); + } ret = PQsendQuery(conn, query); pfree(query); if (ret != 1) @@ -440,7 +452,7 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); page_servers[shard_no].conn = conn; page_servers[shard_no].wes = wes; @@ -844,6 +856,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.protocol_version", + "Version of compute<->page server protocol", + NULL, + &neon_protocol_version, + 1, /* default to old protocol for now */ + 1, /* min */ + 2, /* max */ + PGC_SU_BACKEND, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 44ae766f76..7709ab9d42 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -69,18 +69,33 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; -/* - * supertype of all the Neon*Request structs below +/*-- + * supertype of all the Neon*Request structs below. * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
+ * All requests contain two LSNs: + * + * lsn: request page (or relation size, etc) at this LSN + * not_modified_since: Hint that the page hasn't been modified between + * this LSN and the request LSN (`lsn`). + * + * To request the latest version of a page, you can use MAX_LSN as the request + * LSN. + * + * If you don't know any better, you can always set 'not_modified_since' equal + * to 'lsn', but providing a lower value can speed up processing the request + * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it + * can skip traversing through recent layers which we know to not contain any + * versions for the requested page. + * + * These structs describe the V2 of these requests. The old V1 protocol contained + * just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is + * set to 1, we will convert these to the V1 requests before sending. */ typedef struct { NeonMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonRequest; typedef struct @@ -193,6 +208,7 @@ extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); @@ -225,14 +241,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); 
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 57a16e00ca..44ecdbd9aa 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -168,8 +168,8 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr effective_request_lsn; - XLogRecPtr actual_request_lsn; + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; NeonResponse *response; /* may be null */ PrefetchStatus status; shardno_t shard_no; @@ -269,19 +269,19 @@ static PrefetchState *MyPState; ) \ ) -static XLogRecPtr prefetch_lsn = 0; - static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, - ForkNumber forknum, BlockNumber blkno); +static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr 
*not_modified_since); +static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot); static bool compact_prefetch_buffers(void) @@ -338,8 +338,8 @@ compact_prefetch_buffers(void) target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->effective_request_lsn = source_slot->effective_request_lsn; - target_slot->actual_request_lsn = source_slot->actual_request_lsn; + target_slot->request_lsn = source_slot->request_lsn; + target_slot->not_modified_since = source_slot->not_modified_since; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -358,7 +358,8 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->effective_request_lsn = 0; + source_slot->request_lsn = InvalidXLogRecPtr; + source_slot->not_modified_since = InvalidXLogRecPtr; /* update bookkeeping */ n_moved++; @@ -683,56 +684,39 @@ prefetch_set_unused(uint64 ring_index) compact_prefetch_buffers(); } +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). 
+ */ static void -prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since) { bool found; NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, - .req.latest = false, - .req.lsn = 0, + /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; - if (force_lsn && force_latest) + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + + if (force_request_lsn) { - request.req.lsn = *force_lsn; - request.req.latest = *force_latest; - slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; + request.req.lsn = *force_request_lsn; + request.req.not_modified_since = *force_not_modified_since; } else { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); - - /* - * Note: effective_request_lsn is potentially higher than the - * requested LSN, but still correct: - * - * We know there are no changes between the actual requested LSN and - * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which then - * would have had to result in a larger request LSN for this page. - * - * It is possible that a concurrent backend loads the page, modifies - * it and then evicts it again, but the LSN of that eviction cannot be - * smaller than the current WAL insert/redo pointer, which is already - * larger than this prefetch_lsn. So in any case, that would - * invalidate this cache. - * - * The best LSN to use for effective_request_lsn would be - * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. 
- */ - slot->actual_request_lsn = request.req.lsn = lsn; - prefetch_lsn = Max(prefetch_lsn, lsn); - slot->effective_request_lsn = prefetch_lsn; + neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum, + &request.req.lsn, + &request.req.not_modified_since); } + slot->request_lsn = request.req.lsn; + slot->not_modified_since = request.req.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -749,7 +733,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force /* update slot state */ slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -759,22 +742,25 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * * Register that we may want the contents of BufferTag in the near future. * - * If force_latest and force_lsn are not NULL, those values are sent to the - * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure - * to fill in these values manually. + * If force_request_lsn and force_not_modified_since are not NULL, those + * values are sent to the pageserver. If they are NULL, we utilize the + * lastWrittenLsn -infrastructure to fill them in. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. 
*/ static uint64 -prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, + XLogRecPtr *force_not_modified_since) { uint64 ring_index; PrefetchRequest req; PrefetchRequest *slot; PrfHashEntry *entry; + Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL))); + /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; Retry: @@ -792,40 +778,19 @@ Retry: Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); /* - * If we want a specific lsn, we do not accept requests that were made - * with a potentially different LSN. + * If the caller specified a request LSN to use, only accept prefetch + * responses that satisfy that request. */ - if (force_latest && force_lsn) + if (force_request_lsn) { - /* - * if we want the latest version, any effective_request_lsn < - * request lsn is OK - */ - if (*force_latest) + if (!neon_prefetch_response_usable(*force_request_lsn, + *force_not_modified_since, slot)) { - if (*force_lsn > slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - - } - - /* - * if we don't want the latest version, only accept requests with - * the exact same LSN - */ - else - { - if (*force_lsn != slot->effective_request_lsn) - { - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; } } @@ -921,7 +886,7 @@ Retry: slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_latest, force_lsn); + prefetch_do_request(slot, force_request_lsn, force_not_modified_since); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < 
MyPState->ring_unused); @@ -950,7 +915,7 @@ page_server_request(void const *req) BufferTag tag = {0}; shardno_t shard_no; - switch (((NeonRequest *) req)->tag) + switch (messageTag(req)) { case T_NeonExistsRequest: CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); @@ -966,11 +931,10 @@ page_server_request(void const *req) tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag); + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); - /* * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. @@ -997,8 +961,52 @@ nm_pack_request(NeonRequest *msg) StringInfoData s; initStringInfo(&s); - pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 2) + { + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + } + else + { + bool latest; + XLogRecPtr lsn; + + /* + * In primary, we always request the latest page version. + */ + if (!RecoveryInProgress()) + { + latest = true; + lsn = msg->not_modified_since; + } + else + { + /* + * In the protocol V1, we cannot represent that we want to read + * page at LSN X, and we know that it hasn't been modified since + * Y. We can either use 'not_modified_lsn' as the request LSN, and + * risk getting an error if that LSN is too old and has already + * fallen out of the pageserver's GC horizon, or we can send + * 'request_lsn', causing the pageserver to possibly wait for the + * recent WAL to arrive unnecessarily. Or something in between. We + * choose to use the old LSN and risk GC errors, because that's + * what we've done historically. 
+ */ + latest = false; + lsn = msg->not_modified_since; + } + + pq_sendbyte(&s, msg->tag); + pq_sendbyte(&s, latest); + pq_sendint64(&s, lsn); + } + + /* + * The rest of the request messages are the same between protocol V1 and + * V2 + */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ @@ -1006,8 +1014,6 @@ nm_pack_request(NeonRequest *msg) { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1019,8 +1025,6 @@ nm_pack_request(NeonRequest *msg) { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1032,8 +1036,6 @@ nm_pack_request(NeonRequest *msg) { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, msg_req->dbNode); break; @@ -1042,8 +1044,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -1057,8 +1057,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendbyte(&s, msg_req->kind); pq_sendint32(&s, msg_req->segno); @@ -1209,7 +1207,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); 
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1222,7 +1220,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1236,7 +1234,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1247,7 +1245,7 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1259,7 +1257,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); appendStringInfo(&s, ", 
\"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1531,44 +1529,38 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static XLogRecPtr -neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since) { - XLogRecPtr lsn; + XLogRecPtr last_written_lsn; + + last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); + last_written_lsn = nm_adjust_lsn(last_written_lsn); + Assert(last_written_lsn != InvalidXLogRecPtr); if (RecoveryInProgress()) { - /* - * We don't know if WAL has been generated but not yet replayed, so - * we're conservative in our estimates about latest pages. - */ - *latest = false; + /* Request the page at the last replayed LSN. */ + *request_lsn = GetXLogReplayRecPtr(NULL); + *not_modified_since = last_written_lsn; + Assert(last_written_lsn <= *request_lsn); - /* - * Get the last written LSN of this page. - */ - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - lsn = nm_adjust_lsn(lsn); - - neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since)); } else { XLogRecPtr flushlsn; /* - * Use the latest LSN that was evicted from the buffer cache. Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. 
Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. */ - *latest = true; - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - Assert(lsn != InvalidXLogRecPtr); neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = nm_adjust_lsn(lsn); + LSN_FORMAT_ARGS(last_written_lsn)); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1583,16 +1575,109 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block #else flushlsn = GetFlushRecPtr(); #endif - if (lsn > flushlsn) + if (last_written_lsn > flushlsn) { neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; } + + /* + * Request the latest version of the page. The most up-to-date request + * LSN we could use would be the current insert LSN, but to avoid the + * overhead of looking it up, use 'flushlsn' instead. This relies on + * the assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we wouldn't be + * requesting it. + */ + *request_lsn = flushlsn; + *not_modified_since = last_written_lsn; + } +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. 
+ */ +static bool +neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsn >= not_modified_since); + Assert(slot->request_lsn >= slot->not_modified_since); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since)))); + return false; } - return lsn; + /*--- + * Each request to the pageserver carries two LSN values: + * `not_modified_since` and `request_lsn`. The (not_modified_since, + * request_lsn] range of each request is effectively a claim that the page + * has not been modified between those LSNs. If the range of the old + * request in the queue overlaps with the new request, we know that the + * page hasn't been modified in the union of the ranges. 
We can use the + * response to old request to satisfy the new request in that case. For + * example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. + */ + + /* this follows from the checks above */ + Assert(request_lsn >= slot->not_modified_since); + + return not_modified_since <= slot->request_lsn; } /* @@ -1604,8 +1689,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -1660,12 +1745,13 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum}; @@ -2102,10 +2188,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer) + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer) #endif { NeonResponse *resp; @@ -2148,15 +2234,16 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry != NULL) { slot = entry->slot; - if (slot->effective_request_lsn >= request_lsn) + if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large - * enough, so drop the prefetch */ + else { /* + * Cannot use this prefetch, discard it + * * We can't drop cache for not-yet-received requested items. It is * unlikely this happens, but it can happen if prefetch distance * is large enough and a backend didn't consume all prefetch @@ -2181,8 +2268,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_latest, - &request_lsn); + ring_index = prefetch_register_buffer(buftag, &request_lsn, + ¬_modified_since); slot = GetPrfSlot(ring_index); } else @@ -2246,8 +2333,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2272,8 +2359,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); - 
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); + neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno, + &request_lsn, ¬_modified_since); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2442,8 +2530,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - bool latest; XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; switch (reln->smgr_relpersistence) { @@ -2470,12 +2558,13 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2523,16 +2612,17 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn; - bool latest; + XLogRecPtr request_lsn, + not_modified_since; NRelFileInfo dummy_node = {0}; - request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, + &request_lsn, ¬_modified_since); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.latest = latest, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .dbNode = dbNode, }; @@ -2605,7 +2695,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * the most recently inserted WAL record's LSN. 
*/ lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); /* @@ -2805,14 +2894,33 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn; - /* - * GetRedoStartLsn() returns LSN of basebackup. - * We need to download SLRU segments only once after node startup, - * then SLRUs are maintained locally. - */ - request_lsn = GetRedoStartLsn(); + XLogRecPtr request_lsn, + not_modified_since; + + if (RecoveryInProgress()) + { + request_lsn = GetXLogReplayRecPtr(NULL); + if (request_lsn == InvalidXLogRecPtr) + { + /* + * This happens in neon startup, we start up without replaying any + * records. + */ + request_lsn = GetRedoStartLsn(); + } + } + else + request_lsn = GetXLogInsertRecPtr(); request_lsn = nm_adjust_lsn(request_lsn); + + /* + * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU + * segment has not changed since the basebackup, because in order to + * modify it, we would have had to download it already. And once + * downloaded, we never evict SLRU segments from local disk. 
+ */ + not_modified_since = GetRedoStartLsn(); + SlruKind kind; if (STRPREFIX(path, "pg_xact")) @@ -2827,8 +2935,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, - .req.latest = false, .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, .kind = kind, .segno = segno @@ -2956,6 +3064,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, { BlockNumber relsize; + /* This is only used in WAL replay */ + Assert(RecoveryInProgress()); + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { @@ -2974,14 +3085,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, .tag = T_NeonNblocksRequest, + .lsn = end_recptr, + .not_modified_since = end_recptr, }, .rinfo = rinfo, .forknum = forknum, diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 9c774ec185..1ee87357e5 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql +DATA = neon_test_utils--1.1.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.1.sql similarity index 89% rename from pgxn/neon_test_utils/neon_test_utils--1.0.sql rename to pgxn/neon_test_utils/neon_test_utils--1.1.sql index 23340e352e..534784f319 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.1.sql @@ 
-31,12 +31,12 @@ AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5219571f11..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 82ce5be9f6..677006923d 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer); #endif static 
neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -299,8 +299,11 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *forkname; uint32 blkno; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; + + if (PG_NARGS() != 5) + elog(ERROR, "unexpected number of arguments in SQL function signature"); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -309,6 +312,9 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); + request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -361,7 +367,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data); relation_close(rel, AccessShareLock); @@ -380,6 +386,9 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { char *raw_page_data; + if (PG_NARGS() != 7) + elog(ERROR, "unexpected number of arguments in SQL function signature"); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -403,18 +412,20 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) }; ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + XLogRecPtr request_lsn; + XLogRecPtr not_modified_since; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + request_lsn = PG_ARGISNULL(5) ? 
GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/poetry.lock b/poetry.lock index b43a3f6e93..c10e1d1e99 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,87 +2,87 @@ [[package]] name = "aiohttp" -version = "3.9.2" +version = "3.9.4" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"}, - {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"}, - {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"}, - {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"}, - {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"}, - {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"}, - {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"}, - {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"}, - {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"}, - {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"}, - {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"}, - {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"}, - {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"}, - {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"}, - {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"}, - {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"}, - {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"}, - {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"}, - {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"}, - {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"}, - {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = 
"sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"}, - {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"}, - {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"}, - {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"}, - {file = 
"aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"}, - {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"}, - {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"}, - {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, + {file = 
"aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, + {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, + {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, + {file = 
"aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, + {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, + {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, + {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, + {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, + {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, + {file = 
"aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, + {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, + {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, + 
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, + {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = 
"sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, + {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, + {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, ] [package.dependencies] @@ -1191,13 +1191,13 @@ files = [ [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -2182,6 +2182,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = 
"PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2652,6 +2653,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file 
= "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2889,4 +2900,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c478ef3786bd94a0b7887518ceaf444993b77de873074102d018ac5926bc5d34" +content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b327890be2..0e8d03906b 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -12,6 +12,7 @@ testing = [] anyhow.workspace = true async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true aws-sigv4.workspace = true @@ -36,10 +37,14 @@ http.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } ipnet.workspace = true itertools.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true @@ -54,8 +59,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest = { workspace = true, features = ["json"] } 
-reqwest-middleware.workspace = true +reqwest.workspace = true +reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true @@ -79,6 +84,7 @@ tokio-postgres.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } +tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index e421798067..3795e3b608 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,8 +2,15 @@ mod classic; mod hacks; mod link; +use std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; @@ -13,9 +20,10 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; use crate::intern::EndpointIdInt; -use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED}; +use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ auth::{self, ComputeUserInfoMaybeEndpoint}, @@ -27,10 +35,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that 
functionality pub enum MaybeOwned<'a, T> { @@ -176,17 +181,51 @@ impl TryFrom for ComputeUserInfo { } } +#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)] +pub struct MaskedIp(IpAddr); + +impl MaskedIp { + fn new(value: IpAddr, prefix: u8) -> Self { + match value { + IpAddr::V4(v4) => Self(IpAddr::V4( + Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), + )), + IpAddr::V6(v6) => Self(IpAddr::V6( + Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), + )), + } + } +} + +// This can't be just per IP because that would limit some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; + +impl RateBucketInfo { + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; +} + impl AuthenticationConfig { pub fn check_rate_limit( &self, - ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, ) -> auth::Result { // we have validated the endpoint exists, so let's intern it. - let endpoint_int = EndpointIdInt::from(endpoint); + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); // only count the full hash count if password hack or websocket flow. 
// in other words, if proxy needs to run the hashing @@ -201,17 +240,25 @@ impl AuthenticationConfig { 1 }; - let limit_not_exceeded = self - .rate_limiter - .check((endpoint_int, ctx.peer_addr), password_weight); + let limit_not_exceeded = self.rate_limiter.check( + ( + endpoint_int, + MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + ), + password_weight, + ); if !limit_not_exceeded { warn!( enabled = self.rate_limiter_enabled, "rate limiting authentication" ); - AUTH_RATE_LIMIT_HITS.inc(); - ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); if self.rate_limiter_enabled { return Err(auth::AuthError::too_many_connections()); @@ -267,6 +314,7 @@ async fn auth_quirks( let secret = match secret { Some(secret) => config.check_rate_limit( ctx, + config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, @@ -469,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{net::IpAddr, sync::Arc, time::Duration}; use bytes::BytesMut; use fallible_iterator::FallibleIterator; @@ -482,7 +530,7 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use crate::{ - auth::{ComputeUserInfoMaybeEndpoint, IpPattern}, + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, config::AuthenticationConfig, console::{ self, @@ -491,12 +539,12 @@ mod tests { }, context::RequestMonitoring, proxy::NeonOptions, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + rate_limiter::RateBucketInfo, scram::ServerSecret, stream::{PqStream, Stream}, }; - use super::auth_quirks; + use super::{auth_quirks, AuthRateLimiter}; struct Auth { ips: Vec, @@ -537,6 +585,7 @@ mod tests { scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: 
AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { @@ -548,6 +597,51 @@ mod tests { } } + #[test] + fn masked_ip() { + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + #[tokio::test] async fn auth_quirks_scram() { let (mut client, server) = tokio::io::duplex(1024); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 89773aa1ff..783a1a5a21 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -4,7 +4,7 @@ use crate::{ auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - 
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, + metrics::{Metrics, SniKind}, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI, EndpointId, RoleName, @@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint { ctx.set_endpoint_id(ep.clone()); } + let metrics = Metrics::get(); info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); } else if endpoint.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index c28814b1c8..fb16b76567 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,15 +9,13 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; -use proxy::{BranchId, EndpointId, ProjectId}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -176,7 +174,12 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + 
proxy::metrics::Protocol::SniRouter, + "sni", + ); handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { @@ -199,6 +202,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &mut RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -228,7 +232,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -251,7 +258,7 @@ async fn handle_client( tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -268,18 +275,15 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; - - let metrics_aux: MetricsAuxInfo = MetricsAuxInfo { - endpoint_id: (&EndpointId::from("")).into(), - project_id: (&ProjectId::from("")).into(), - branch_id: (&BranchId::from("")).into(), - cold_start_info: proxy::console::messages::ColdStartInfo::Unknown, - }; + let mut client = tokio::net::TcpStream::connect(destination).await?; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); - ctx.log(); + ctx.log_connect(); - proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; + + Ok(()) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 56a3ef79cd..0956aae6c0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -7,6 +7,7 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; @@ -18,11 +19,10 @@ use proxy::config::ProjectInfoCacheOptions; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; -use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT; -use proxy::rate_limiter::AuthRateLimiter; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use proxy::redis::elasticache; @@ -42,6 +42,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -117,8 +118,11 @@ struct ProxyCliArgs { #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
- #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -131,14 +135,8 @@ struct ProxyCliArgs { #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. - #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form '@'. @@ -151,14 +149,12 @@ struct ProxyCliArgs { /// Authentication rate limiter max number of hashes per second. 
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, /// Redis rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] redis_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -189,7 +185,9 @@ struct ProxyCliArgs { /// cache for `project_info` (use `size=0` to disable) #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, - + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -205,6 +203,12 @@ struct ProxyCliArgs { /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
#[clap(long, default_value = "4194304")] metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -249,14 +253,18 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); - match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) { - Ok(t) => { - t.start(); + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None } - Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"), - } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; @@ -296,27 +304,27 @@ async fn main() -> anyhow::Result<()> { ), aws_credentials_provider, )); - let redis_notifications_client = - match (args.redis_notifications, (args.redis_host, args.redis_port)) { - (Some(url), _) => { - info!("Starting redis notifications listener ({url})"); - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } - (None, (Some(host), Some(port))) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host, - port, - elasticache_credentials_provider.clone(), - ), + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + 
elasticache_credentials_provider.clone(), ), - (None, (None, None)) => { - warn!("Redis is disabled"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }; + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -332,11 +340,9 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); let cancel_map = CancelMap::default(); - // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x))); - let redis_publisher = match &redis_notifications_client { + let redis_publisher = match ®ional_redis_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), @@ -349,7 +355,7 @@ async fn main() -> anyhow::Result<()> { >::new( cancel_map.clone(), redis_publisher, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT, + proxy::metrics::CancellationSource::FromClient, )); // client facing tasks. 
these will exit on error or on cancellation @@ -359,7 +365,6 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); @@ -374,7 +379,6 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), - endpoint_rate_limiter.clone(), cancellation_handler.clone(), )); } @@ -387,7 +391,14 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { @@ -395,21 +406,43 @@ async fn main() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); client_tasks.spawn(usage_metrics::task_backup( &metrics_config.backup_metric_collection_config, - cancellation_token, + cancellation_token.clone(), )); } if let auth::BackendType::Console(api, _) = &config.auth_backend { if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { - if let Some(redis_notifications_client) = redis_notifications_client { - let cache = api.caches.project_info.clone(); - maintenance_tasks.spawn(notifications::task_main( - redis_notifications_client.clone(), - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + 
maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); } } } @@ -476,46 +509,54 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, + endpoint_cache_config, ))); - let 
config::WakeComputeLockOptions { + let config::ConcurrencyLockOptions { shards, permits, epoch, timeout, } = args.wake_compute_lock.parse()?; info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), - )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + let locks = Box::leak(Box::new(console::locks::ApiLocks::new( + "wake_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches, locks); + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); + let api = + console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter); let api = console::provider::ConsoleBackend::Console(api); auth::BackendType::Console(MaybeOwned::Owned(api), ()) } @@ -531,6 +572,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; + + let config::ConcurrencyLockOptions { + shards, + permits, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)"); + let connect_compute_locks = console::locks::ApiLocks::new( + "connect_compute_lock", + permits, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + let http_config = HttpConfig { request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { @@ -546,10 +604,9 @@ fn build_config(args: 
&ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; let mut redis_rps_limit = args.redis_rps_limit.clone(); RateBucketInfo::validate(&mut redis_rps_limit)?; @@ -562,13 +619,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit, redis_rps_limit, handshake_timeout: args.handshake_timeout, region: args.region.clone(), aws_region: args.aws_region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute_retry_config: config::RetryConfig::parse( + &args.connect_to_compute_retry, + )?, })); + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + Ok(config) } diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index fc5f416395..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,4 +1,5 @@ pub mod common; +pub mod endpoints; pub mod project_info; mod timed_lru; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..4bc10a6020 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,247 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; +use tracing::info; + +use crate::{ + 
config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + // If cache is disabled, just collect the metrics and return or + // If the limiter allows, we don't need to check the cache. 
+ if self.config.disable_cache || self.limiter.lock().await.check() { + return true; + } + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. + if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if cancellation_token.is_cancelled() { + info!("cancellation token is cancelled, exiting"); + tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; + // 1 week. 
+ } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + if total != 0 { + break; + } + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. 
+ continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index d8a1d261ce..10cc4ceee1 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -5,9 +5,11 @@ use std::{ time::Duration, }; +use async_trait::async_trait; use dashmap::DashMap; use rand::{thread_rng, Rng}; use smol_str::SmolStr; +use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; @@ -21,11 +23,12 @@ use crate::{ use super::{Cache, Cached}; +#[async_trait] pub trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); - fn enable_ttl(&self); - fn disable_ttl(&self); + async fn decrement_active_listeners(&self); + async fn increment_active_listeners(&self); } 
struct Entry { @@ -116,8 +119,10 @@ pub struct ProjectInfoCacheImpl { start_time: Instant, ttl_disabled_since_us: AtomicU64, + active_listeners_lock: Mutex, } +#[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); @@ -148,15 +153,27 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { } } } - fn enable_ttl(&self) { - self.ttl_disabled_since_us - .store(u64::MAX, std::sync::atomic::Ordering::Relaxed); + async fn decrement_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + if *listeners_guard == 0 { + tracing::error!("active_listeners count is already 0, something is broken"); + return; + } + *listeners_guard -= 1; + if *listeners_guard == 0 { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); + } } - fn disable_ttl(&self) { - let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; - self.ttl_disabled_since_us - .store(new_ttl, std::sync::atomic::Ordering::Relaxed); + async fn increment_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + *listeners_guard += 1; + if *listeners_guard == 1 { + let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::SeqCst); + } } } @@ -168,6 +185,7 @@ impl ProjectInfoCacheImpl { config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), + active_listeners_lock: Mutex::new(0), } } @@ -432,7 +450,7 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), })); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_secs(2)).await; let project_id: ProjectId = "project".into(); @@ -489,7 +507,7 @@ mod tests { } #[tokio::test] - async fn 
test_disable_ttl_invalidate_added_before() { + async fn test_increment_active_listeners_invalidate_added_before() { tokio::time::pause(); let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { size: 2, @@ -514,7 +532,7 @@ mod tests { (&user1).into(), secret1.clone(), ); - cache.clone().disable_ttl(); + cache.clone().increment_active_listeners().await; tokio::time::advance(Duration::from_millis(100)).await; cache.insert_role_secret( (&project_id).into(), diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 6151513614..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use crate::{ error::ReportableError, - metrics::NUM_CANCELLATION_REQUESTS, + metrics::{CancellationRequest, CancellationSource, Metrics}, redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }, @@ -28,7 +28,7 @@ pub struct CancellationHandler

{ client: P, /// This field used for the monitoring purposes. /// Represents the source of the cancellation request. - from: &'static str, + from: CancellationSource, } #[derive(Debug, Error)] @@ -89,9 +89,13 @@ impl CancellationHandler

{ // NB: we should immediately release the lock after cloning the token. let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "not_found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); match self.client.try_publish(key, session_id).await { Ok(()) => {} // do nothing Err(e) => { @@ -103,9 +107,13 @@ impl CancellationHandler

{ } return Ok(()); }; - NUM_CANCELLATION_REQUESTS - .with_label_values(&[self.from, "found"]) - .inc(); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -122,7 +130,7 @@ impl CancellationHandler

{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: &'static str) -> Self { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { Self { map, client: (), @@ -132,7 +140,7 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: &'static str) -> Self { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { Self { map, client, from } } } @@ -192,15 +200,13 @@ impl

Drop for Session

{ #[cfg(test)] mod tests { - use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS; - use super::*; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( CancelMap::default(), - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, + CancellationSource::FromRedis, )); let session = cancellation_handler.clone().get_session(); @@ -214,7 +220,7 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = CancellationHandler::<()>::new(Default::default(), "local"); + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ee33b97fbd..23266ac4ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -4,12 +4,12 @@ use crate::{ console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, - metrics::NUM_DB_CONNECTIONS_GAUGE, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, + Host, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; use pq_proto::StartupMessageParams; use std::{io, net::SocketAddr, time::Duration}; use thiserror::Error; @@ -102,6 +102,16 @@ impl ConnCfg { } } + pub fn get_host(&self) -> Result { + match self.0.get_hosts() { + [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + // we should not have multiple address or unix addresses. + _ => Err(WakeComputeError::BadComputeAddress( + "invalid compute address".into(), + )), + } + } + /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. @@ -249,7 +259,7 @@ pub struct PostgresConnection { /// Labels for proxy's metrics. 
pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -261,7 +271,9 @@ impl ConnCfg { aux: MetricsAuxInfo, timeout: Duration, ) -> Result { + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + drop(pause); let tls_connector = native_tls::TlsConnector::builder() .danger_accept_invalid_certs(allow_self_signed_compute) @@ -271,7 +283,9 @@ impl ConnCfg { let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; // connect_raw() will not use TLS if sslmode is "disable" + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); @@ -295,9 +309,7 @@ impl ConnCfg { params, cancel_closure, aux, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index fc490c7348..0c8e284d0b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,7 +1,9 @@ use crate::{ - auth, - rate_limiter::{AuthRateLimiter, RateBucketInfo}, + auth::{self, backend::AuthRateLimiter}, + console::locks::ApiLocks, + rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions, + Host, }; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; @@ -29,11 +31,13 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: Vec, pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, pub aws_region: String, + pub wake_compute_retry_config: RetryConfig, + pub connect_compute_locks: ApiLocks, + pub 
connect_to_compute_retry_config: RetryConfig, } #[derive(Debug)] @@ -58,6 +62,7 @@ pub struct AuthenticationConfig { pub scram_protocol_timeout: tokio::time::Duration, pub rate_limiter_enabled: bool, pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { @@ -313,6 +318,80 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
+ fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, @@ -443,8 +522,61 @@ impl FromStr for ProjectInfoCacheOptions { } } +/// This is a config for connect to compute and wake compute. 
+#[derive(Clone, Copy, Debug)] +pub struct RetryConfig { + /// Number of times we should retry. + pub max_retries: u32, + /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 + pub base_delay: tokio::time::Duration, + /// Exponential base for retry wait duration + pub backoff_factor: f64, +} + +impl RetryConfig { + /// Default options for RetryConfig. + + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. + pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + + /// Parse retry options passed via cmdline. + /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. + pub fn parse(options: &str) -> anyhow::Result { + let mut num_retries = None; + let mut base_retry_wait_duration = None; + let mut retry_wait_exponent_base = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "num_retries" => num_retries = Some(value.parse()?), + "base_retry_wait_duration" => { + base_retry_wait_duration = Some(humantime::parse_duration(value)?) + } + "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), + unknown => bail!("unknown key: {unknown}"), + } + } + + Ok(Self { + max_retries: num_retries.context("missing `num_retries`")?, + base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, + backoff_factor: retry_wait_exponent_base + .context("missing `retry_wait_exponent_base`")?, + }) + } +} + /// Helper for cmdline cache options parsing. 
-pub struct WakeComputeLockOptions { +pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn @@ -455,9 +587,12 @@ pub struct WakeComputeLockOptions { pub timeout: Duration, } -impl WakeComputeLockOptions { +impl ConcurrencyLockOptions { /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = + "shards=64,permits=50,epoch=10m,timeout=500ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; @@ -507,7 +642,7 @@ impl WakeComputeLockOptions { } } -impl FromStr for WakeComputeLockOptions { +impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { @@ -543,7 +678,7 @@ mod tests { #[test] fn test_parse_lock_options() -> anyhow::Result<()> { - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -554,7 +689,7 @@ mod tests { assert_eq!(shards, 32); assert_eq!(permits, 4); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, @@ -565,7 +700,7 @@ mod tests { assert_eq!(shards, 16); assert_eq!(permits, 8); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, permits, shards, diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 45161f5ac8..9869b95768 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,3 +1,4 @@ +use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use std::fmt; @@ -102,7 +103,7 @@ pub struct MetricsAuxInfo { pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, 
FixedCardinalityLabel)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] @@ -110,9 +111,11 @@ pub enum ColdStartInfo { /// Compute was already running Warm, #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] /// Compute was not running but there was an available VM VmPoolHit, #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] /// Compute was not running and there were no VMs available VmPoolMiss, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index f7d621fb12..dfda29e0b1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -8,15 +8,16 @@ use crate::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, IpPattern, }, - cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, - config::{CacheOptions, ProjectInfoCacheOptions}, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, intern::ProjectIdInt, + metrics::ApiLockMetrics, scram, EndpointCacheKey, }; use dashmap::DashMap; -use std::{sync::Arc, time::Duration}; +use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -207,6 +208,9 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), + #[error("Too many connections attempts")] + TooManyConnections, + #[error("Timeout waiting to acquire wake compute lock")] TimeoutError, } @@ -239,6 +243,8 @@ pub mod errors { // However, API might return a meaningful error. 
ApiError(e) => e.to_string_client(), + TooManyConnections => self.to_string(), + TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), } } @@ -249,6 +255,7 @@ pub mod errors { match self { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), + WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, } } @@ -416,12 +423,15 @@ pub struct ApiCaches { pub node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -431,83 +441,41 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } /// Various caches for [`console`](super). 
-pub struct ApiLocks { +pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: DashMap>, permits: usize, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, } -impl ApiLocks { +impl ApiLocks { pub fn new( name: &'static str, permits: usize, shards: usize, timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), permits, timeout, - lock_acquire_lag, - registered, - 
unregistered, - reclamation_lag, + epoch, + metrics, }) } - pub async fn get_wake_compute_permit( - &self, - key: &EndpointCacheKey, - ) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } @@ -520,7 +488,7 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); + self.metrics.semaphores_registered.inc(); Arc::new(Semaphore::new(self.permits)) }) .clone() @@ -528,20 +496,21 @@ impl ApiLocks { }; let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); Ok(WakeComputePermit { permit: Some(permit??), }) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { + pub async fn garbage_collect_worker(&self) { if self.permits == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; @@ -554,13 +523,13 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 1a3e2ca795..ec66641d01 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -7,13 +7,15 @@ use 
super::{ NodeInfo, }; use crate::{ - auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram, -}; -use crate::{ - cache::Cached, - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + rate_limiter::EndpointRateLimiter, + scram, EndpointCacheKey, Normalize, }; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; use std::sync::Arc; use tokio::time::Instant; @@ -23,7 +25,8 @@ use tracing::{error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, pub caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub locks: &'static ApiLocks, + pub endpoint_rate_limiter: Arc, jwt: String, } @@ -32,7 +35,8 @@ impl Api { pub fn new( endpoint: http::Endpoint, caches: &'static ApiCaches, - locks: &'static ApiLocks, + locks: &'static ApiLocks, + endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -42,6 +46,7 @@ impl Api { endpoint, caches, locks, + endpoint_rate_limiter, jwt, } } @@ -55,6 +60,15 @@ impl Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { @@ -81,7 +95,9 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), + Some(http::StatusCode::NOT_FOUND) => { + return Ok(AuthInfo::default()); + } _otherwise => return Err(e.into()), }, }; @@ -95,7 +111,10 @@ impl Api { Some(secret) }; let allowed_ips = body.allowed_ips.unwrap_or_default(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { secret, allowed_ips, @@ -174,23 +193,27 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result { - let ep = &user_info.endpoint; + let normalized_ep = &user_info.endpoint.normalize(); let user = &user_info.user; - if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) { + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( project_id, - ep_int, + normalized_ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -204,30 +227,34 @@ impl super::Api for Api { ctx: &mut RequestMonitoring, user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let ep = &user_info.endpoint; - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); 
return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { - let ep_int = ep.into(); + let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( project_id, - ep_int, + normalized_ep_int, user.into(), auth_info.secret.clone(), ); - self.caches - .project_info - .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); ctx.set_project_id(project_id); } Ok(( @@ -254,7 +281,15 @@ impl super::Api for Api { return Ok(cached); } - let permit = self.locks.get_wake_compute_permit(&key).await?; + // check rate limit + if !self + .endpoint_rate_limiter + .check(user_info.endpoint.normalize().into(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check diff --git a/proxy/src/context.rs b/proxy/src/context.rs index fec95f4722..dfd3ef108e 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -5,14 +5,14 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{field::display, info_span, Span}; +use tracing::{field::display, info, info_span, Span}; use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, intern::{BranchIdInt, ProjectIdInt}, - metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, DbName, EndpointId, RoleName, }; @@ -20,7 +20,8 @@ use 
self::parquet::RequestData; pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); /// Context data for a single request to connect to a database. /// @@ -29,7 +30,7 @@ static LOG_CHAN: OnceCell> = OnceCell::ne pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, pub span: Span, @@ -49,7 +50,12 @@ pub struct RequestMonitoring { // extra // This sender is here to keep the request monitoring channel open while requests are taking place. sender: Option>, + // This sender is only used to log the length of session in case of success. + disconnect_sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. + rejected: Option, + disconnect_timestamp: Option>, } #[derive(Clone, Debug)] @@ -65,7 +71,7 @@ impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { let span = info_span!( @@ -74,6 +80,7 @@ impl RequestMonitoring { ?session_id, %peer_addr, ep = tracing::field::Empty, + role = tracing::field::Empty, ); Self { @@ -93,16 +100,19 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, + rejected: None, cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), + disconnect_timestamp: None, } } #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -113,6 
+123,10 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = Some(rejected); + } + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { self.cold_start_info = info; self.latency_timer.cold_start_info(info); @@ -134,9 +148,9 @@ impl RequestMonitoring { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { if self.endpoint_id.is_none() { self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); self.endpoint_id = Some(endpoint_id); } } @@ -150,6 +164,7 @@ impl RequestMonitoring { } pub fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } @@ -157,14 +172,22 @@ impl RequestMonitoring { self.auth_method = Some(auth_method); } + pub fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + } + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { - ERROR_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .inc(); + // Do not record errors from the private address to metrics. 
+ if !self.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } if let Some(ep) = &self.endpoint_id { - ENDPOINT_ERRORS_BY_KIND - .with_label_values(&[kind.to_metric_label()]) - .measure(ep); + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); } self.error_kind = Some(kind); } @@ -173,13 +196,55 @@ impl RequestMonitoring { self.success = true; } - pub fn log(self) {} -} - -impl Drop for RequestMonitoring { - fn drop(&mut self) { + pub fn log_connect(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + if let Some(rejected) = self.rejected { + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?outcome, + ?rejected, + ?ep, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); + } if let Some(tx) = self.sender.take() { let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } + + fn log_disconnect(&mut self) { + // If we are here, it's guaranteed that the user successfully connected to the endpoint. + // Here we log the length of the session. 
+ self.disconnect_timestamp = Some(Utc::now()); + if let Some(tx) = self.disconnect_sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } + } +} + +impl Drop for RequestMonitoring { + fn drop(&mut self) { + if self.sender.is_some() { + self.log_connect(); + } else { + self.log_disconnect(); + } + } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index eb77409429..8104fe6087 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -19,7 +19,10 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig}; +use crate::{ + config::{remote_storage_from_toml, OptRemoteStorageConfig}, + context::LOG_CHAN_DISCONNECT, +}; use super::{RequestMonitoring, LOG_CHAN}; @@ -31,6 +34,9 @@ pub struct ParquetUploadArgs { #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] parquet_upload_row_group_size: usize, @@ -91,6 +97,8 @@ pub struct RequestData { /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, + /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`. 
+ disconnect_timestamp: Option, } impl From<&RequestMonitoring> for RequestData { @@ -111,7 +119,7 @@ impl From<&RequestMonitoring> for RequestData { super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", }), - protocol: value.protocol, + protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, @@ -120,6 +128,7 @@ impl From<&RequestMonitoring> for RequestData { .elapsed() .unwrap_or_default() .as_micros() as u64, // 584 millenia... good enough + disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } @@ -141,8 +150,9 @@ pub async fn worker( LOG_CHAN.set(tx.downgrade()).unwrap(); // setup row stream that will close on cancellation + let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { - cancellation_token.cancelled().await; + cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx); @@ -167,9 +177,38 @@ pub async fn worker( test_remote_failures: 0, }; - worker_inner(storage, rx, parquet_config).await + // TODO(anna): consider moving this to a separate function. + if let Some(disconnect_events_storage_config) = + config.parquet_upload_disconnect_events_remote_storage + { + let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); + LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. 
+ drop(tx_disconnect); + }); + let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); + let rx_disconnect = rx_disconnect.map(RequestData::from); + + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .context("remote storage for disconnect events init")?; + let parquet_config_disconnect = parquet_config.clone(); + tokio::try_join!( + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + ) + .map(|_| ()) + } else { + worker_inner(storage, rx, parquet_config).await + } } +#[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, @@ -413,6 +452,7 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) @@ -451,6 +491,7 @@ mod tests { success: rng.gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), + disconnect_timestamp: None, } } @@ -519,15 +560,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -557,11 +598,11 @@ mod tests { assert_eq!( file_stats, [ - (1220633, 5, 10000), - (1226783, 5, 10000), - (1228577, 5, 10000), - (1227939, 5, 10000), - (1219217, 5, 10000) + (1221738, 5, 10000), + (1227888, 5, 10000), + (1229682, 5, 10000), + (1229044, 5, 10000), + (1220322, 5, 10000) ] ); @@ -593,11 +634,11 @@ mod tests { assert_eq!( file_stats, [ - (1206280, 5, 10000), - (1206011, 5, 10000), - (1206304, 5, 10000), - (1206292, 5, 10000), - (1206547, 5, 10000) + 
(1207385, 5, 10000), + (1207116, 5, 10000), + (1207409, 5, 10000), + (1207397, 5, 10000), + (1207652, 5, 10000) ] ); @@ -622,15 +663,15 @@ mod tests { assert_eq!( file_stats, [ - (1314385, 3, 6000), - (1314378, 3, 6000), - (1314438, 3, 6000), - (1314395, 3, 6000), - (1314525, 3, 6000), - (1314367, 3, 6000), - (1314159, 3, 6000), - (1314395, 3, 6000), - (438352, 1, 2000) + (1315008, 3, 6000), + (1315001, 3, 6000), + (1315061, 3, 6000), + (1315018, 3, 6000), + (1315148, 3, 6000), + (1314990, 3, 6000), + (1314782, 3, 6000), + (1315018, 3, 6000), + (438575, 1, 2000) ] ); @@ -667,7 +708,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)] + [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 4614f3913d..fdfe50a494 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... User, /// Network error between user and proxy. 
Not necessarily user error + #[label(rename = "clientdisconnect")] ClientDisconnect, /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] ServiceRateLimit, /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, /// Postgres error diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..fc7400869f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,7 +4,7 @@ pub mod health_server; -use std::{sync::Arc, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; @@ -13,13 +13,16 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. -pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() .dns_resolver(Arc::new(GaiResolver::default())) .connection_verbose(true) @@ -28,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } @@ -90,22 +92,23 @@ impl Endpoint { /// Execute a [request](reqwest::Request). 
pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use hyper::{ - client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, - service::Service, +use hyper_util::client::legacy::connect::dns::{ + GaiResolver as HyperGaiResolver, Name as HyperName, }; -use reqwest::dns::{Addrs, Resolve, Resolving}; +use reqwest::dns::{Addrs, Name, Resolve, Resolving}; +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use tower_service::Service; #[derive(Debug)] pub struct GaiResolver(HyperGaiResolver); @@ -118,11 +121,12 @@ impl Default for GaiResolver { impl Resolve for GaiResolver { fn resolve(&self, name: Name) -> Resolving { let this = &mut self.0.clone(); + let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); let start = Instant::now(); Box::pin( - Service::::call(this, name.clone()).map(move |result| { + Service::::call(this, hyper_name).map(move |result| { let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); result .map(|addrs| -> Addrs { Box::new(addrs) }) .map_err(|err| -> Box { Box::new(err) }) diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cbb17ebcb7..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,30 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use 
std::{convert::Infallible, net::TcpListener}; -use tracing::info; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; use utils::http::{ - endpoint::{self, prometheus_metrics_handler, request_span}, + endpoint::{self, request_span}, error::ApiError, json::json_response, RouterBuilder, RouterService, }; +use crate::jemalloc; + async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + endpoint::make_router() - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) 
@@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index a6519bdff9..e38135dd22 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -160,6 +160,11 @@ impl From<&EndpointId> for EndpointIdInt { EndpointIdTag::get_interner().get_or_intern(value) } } +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct BranchIdTag; @@ -175,6 +180,11 @@ impl From<&BranchId> for BranchIdInt { BranchIdTag::get_interner().get_or_intern(value) } } +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + 
BranchIdTag::get_interner().get_or_intern(&value) + } +} #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub struct ProjectIdTag; @@ -190,6 +200,11 @@ impl From<&ProjectId> for ProjectIdInt { ProjectIdTag::get_interner().get_or_intern(value) } } +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} #[cfg(test)] mod tests { diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index ed20798d56..3243e6a140 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,27 +1,45 @@ -use std::time::Duration; +use std::marker::PhantomData; -use metrics::IntGauge; -use prometheus::{register_int_gauge_with_registry, Registry}; +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { epoch: epoch_mib, - active: stats::active_mib, - active_gauge: IntGauge, - allocated: stats::allocated_mib, - allocated_gauge: IntGauge, - mapped: stats::mapped_mib, - mapped_gauge: IntGauge, - metadata: stats::metadata_mib, - metadata_gauge: IntGauge, - resident: stats::resident_mib, - resident_gauge: IntGauge, - retained: stats::retained_mib, - retained_gauge: IntGauge, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } } impl MetricRecorder { - pub fn 
new(registry: &Registry) -> Result { + pub fn new() -> Result { tracing::info!( config = config::malloc_conf::read()?, version = version::read()?, @@ -30,71 +48,69 @@ impl MetricRecorder { Ok(Self { epoch: epoch::mib()?, - active: stats::active::mib()?, - active_gauge: register_int_gauge_with_registry!( - "jemalloc_active_bytes", - "Total number of bytes in active pages allocated by the process", - registry - )?, - allocated: stats::allocated::mib()?, - allocated_gauge: register_int_gauge_with_registry!( - "jemalloc_allocated_bytes", - "Total number of bytes allocated by the process", - registry - )?, - mapped: stats::mapped::mib()?, - mapped_gauge: register_int_gauge_with_registry!( - "jemalloc_mapped_bytes", - "Total number of bytes in active extents mapped by the allocator", - registry - )?, - metadata: stats::metadata::mib()?, - metadata_gauge: register_int_gauge_with_registry!( - "jemalloc_metadata_bytes", - "Total number of bytes dedicated to jemalloc metadata", - registry - )?, - resident: stats::resident::mib()?, - resident_gauge: register_int_gauge_with_registry!( - "jemalloc_resident_bytes", - "Total number of bytes in physically resident data pages mapped by the allocator", - registry - )?, - retained: stats::retained::mib()?, - retained_gauge: register_int_gauge_with_registry!( - "jemalloc_retained_bytes", - "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system", - registry - )?, - }) - } - - fn _poll(&self) -> Result<(), anyhow::Error> { - self.epoch.advance()?; - self.active_gauge.set(self.active.read()? as i64); - self.allocated_gauge.set(self.allocated.read()? as i64); - self.mapped_gauge.set(self.mapped.read()? as i64); - self.metadata_gauge.set(self.metadata.read()? as i64); - self.resident_gauge.set(self.resident.read()? as i64); - self.retained_gauge.set(self.retained.read()? 
as i64); - Ok(()) - } - - #[inline] - pub fn poll(&self) { - if let Err(error) = self._poll() { - tracing::warn!(%error, "Failed to poll jemalloc stats"); - } - } - - pub fn start(self) -> tokio::task::JoinHandle<()> { - tokio::task::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(15)); - interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - loop { - self.poll(); - interval.tick().await; - } + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, }) } } + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! 
jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index da7c7f3ed2..35c1616481 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -127,6 +127,24 @@ macro_rules! smol_str_wrapper { }; } +const POOLER_SUFFIX: &str = "-pooler"; + +pub trait Normalize { + fn normalize(&self) -> Self; +} + +impl + From> Normalize for S { + fn normalize(&self) -> Self { + if self.as_ref().ends_with(POOLER_SUFFIX) { + let mut s = self.as_ref().to_string(); + s.truncate(s.len() - POOLER_SUFFIX.len()); + s.into() + } else { + self.clone() + } + } +} + // 90% of role name strings are 20 characters or less. smol_str_wrapper!(RoleName); // 50% of endpoint strings are 23 characters or less. @@ -140,3 +158,25 @@ smol_str_wrapper!(ProjectId); smol_str_wrapper!(EndpointCacheKey); smol_str_wrapper!(DbName); + +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + +// Endpoints are a bit tricky. Rare they might be branches or projects. 
+impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 59ee899c08..1590316925 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,181 +1,372 @@ -use ::metrics::{ - exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, -}; -use metrics::{ - register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter, - IntCounterPair, -}; +use std::sync::OnceLock; + +use lasso::ThreadedRodeo; +use measured::{ + label::StaticLabelSet, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, +}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; use tokio::time::{self, Instant}; use crate::console::messages::ColdStartInfo; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +pub struct Metrics { + #[metric(namespace = "proxy")] + pub proxy: ProxyMetrics, -pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - 
"Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, +} -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); +impl Metrics { + pub fn get() -> &'static Self { + static SELF: OnceLock = OnceLock::new(); + SELF.get_or_init(|| Metrics { + proxy: ProxyMetrics::default(), + wake_compute_lock: ApiLockMetrics::new(), + }) + } +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 6 * 2 * 2 = 72 counters - &["protocol", "cold_start_info", "outcome", "excluded"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +#[derive(MetricGroup)] +#[metric(new())] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], + /// Time it took for proxy to establish a connection to the compute endpoint. 
+ // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. 
+ #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_http_conn_content_length_bytes", - "Number of bytes the HTTP response content consumes", - // request/response - &["direction"], - // smallest bucket = 16 bytes - // largest bucket = 4^12 * 16 bytes = 256MB - exponential_buckets(16.0, 4.0, 12).unwrap() - ) - .unwrap() -}); + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); + /// Number of connection failures (per kind). 
+ pub connection_failures_total: CounterVec>, -pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { - register_int_gauge!( - "proxy_http_pool_opened_connections", - "Number of opened connections to a database.", - ) - .unwrap() -}); + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, -pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_cancellation_requests_total", - "Number of cancellation requests (per found/not_found).", - &["source", "kind"], - ) - .unwrap() -}); + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client"; -pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis"; + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). + pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, + + /// Number of retries (per outcome, per retry_type). 
+ #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] + pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). + pub redis_events_count: CounterVec>, + + #[metric(namespace = "connect_compute_lock")] + pub connect_compute_lock: ApiLockMetrics, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ProxyMetrics { + fn default() -> Self { + Self::new() + } +} + +impl Default for ApiLockMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] 
+#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const 
DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, + ClientCplaneCompute, + ClientCplaneComputeRetry, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = 
RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} pub enum Waiting { Cplane, Client, Compute, + RetryTimeout, } #[derive(Default)] @@ -183,20 +374,7 @@ struct Accumulated { cplane: time::Duration, client: time::Duration, compute: time::Duration, -} - -enum Outcome { - Success, - Failed, -} - -impl Outcome { - fn as_str(&self) -> &'static str { - match self { - Outcome::Success => "success", - Outcome::Failed => "failed", - } - } + retry: time::Duration, } pub struct LatencyTimer { @@ -207,9 +385,9 @@ pub struct LatencyTimer { // accumulated time on the stopwatch accumulated: Accumulated, // label data - protocol: &'static str, + protocol: Protocol, cold_start_info: ColdStartInfo, - outcome: Outcome, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { @@ -219,7 +397,7 @@ pub struct LatencyTimerPause<'a> { } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { start: time::Instant::now(), stop: None, @@ -227,7 +405,7 @@ impl LatencyTimer { protocol, cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: Outcome::Failed, + outcome: ConnectOutcome::Failed, } } @@ -248,7 +426,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = Outcome::Success; + self.outcome = ConnectOutcome::Success; } } @@ -259,132 +437,119 @@ impl Drop for LatencyTimerPause<'_> { Waiting::Cplane => self.timer.accumulated.cplane += dur, Waiting::Client => self.timer.accumulated.client += 
dur, Waiting::Compute => self.timer.accumulated.compute += dur, + Waiting::RetryTimeout => self.timer.accumulated.retry += dur, } } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { let duration = self .stop .unwrap_or_else(time::Instant::now) .duration_since(self.start); - // Excluding cplane communication from the accumulated time. - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client", - ]) - .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + // Exclude client and cplane communication from the accumulated time. let accumulated_total = self.accumulated.client + self.accumulated.cplane; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - self.cold_start_info.as_str(), - self.outcome.as_str(), - "client_and_cplane", - ]) - .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue communication from the accumulated time. 
+ let accumulated_total = + self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneCompute, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue, retry communication from the accumulated time. + let accumulated_total = self.accumulated.client + + self.accumulated.cplane + + self.accumulated.compute + + self.accumulated.retry; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneComputeRetry, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } -pub static CONNECTING_ENDPOINTS: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_connecting_endpoints", - "HLL approximate cardinality of endpoints that are connecting", - &["protocol"], - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] 
+pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} -pub static ERROR_BY_KIND: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_errors_total", - "Number of errors by a given classification", - &["type"], - ) - .unwrap() -}); +#[derive(LabelGroup)] +#[label(set = RetriesMetricSet)] +pub struct RetriesMetricGroup { + pub outcome: ConnectOutcome, + pub retry_type: RetryType, +} -pub static ENDPOINT_ERRORS_BY_KIND: Lazy> = Lazy::new(|| { - register_hll_vec!( - 32, - "proxy_endpoints_affected_by_errors", - "Number of endpoints affected by errors of a given classification", - &["type"], - ) - .unwrap() -}); +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum RetryType { + WakeCompute, + ConnectToCompute, +} -pub static REDIS_BROKEN_MESSAGES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_redis_errors_total", - "Number of errors by a given classification", - &["channel"], - ) - .unwrap() -}); - -pub static TLS_HANDSHAKE_FAILURES: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_tls_handshake_failures", - "Number of TLS handshake failures", - ) - .unwrap() -}); - -pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy> = Lazy::new(|| { - register_hll!( - 32, - "proxy_endpoints_auth_rate_limits", - "Number of endpoints affected by authentication rate limits", - ) - .unwrap() -}); - -pub static AUTH_RATE_LIMIT_HITS: Lazy = Lazy::new(|| { - register_int_counter!( - "proxy_requests_auth_rate_limits_total", - "Number of connection requests affected by authentication rate limits", - ) - .unwrap() -}); +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 700c8c8681..1dd4563514 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ 
-1,48 +1,26 @@ //! Proxy Protocol V2 implementation use std::{ - future::{poll_fn, Future}, io, net::SocketAddr, - pin::{pin, Pin}, - sync::Mutex, - task::{ready, Context, Poll}, + pin::Pin, + task::{Context, Poll}, }; -use bytes::{Buf, BytesMut}; -use hyper::server::accept::Accept; -use hyper::server::conn::{AddrIncoming, AddrStream}; -use metrics::IntCounterPairGuard; +use bytes::BytesMut; use pin_project_lite::pin_project; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -use uuid::Uuid; - -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; - -pub struct ProxyProtocolAccept { - pub incoming: AddrIncoming, - pub protocol: &'static str, -} pin_project! { - pub struct WithClientIp { + /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough + pub struct ChainRW { #[pin] pub inner: T, buf: BytesMut, - tlv_bytes: u16, - state: ProxyParse, } } -#[derive(Clone, PartialEq, Debug)] -enum ProxyParse { - NotStarted, - - Finished(SocketAddr), - None, -} - -impl AsyncWrite for WithClientIp { +impl AsyncWrite for ChainRW { #[inline] fn poll_write( self: Pin<&mut Self>, @@ -77,364 +55,174 @@ impl AsyncWrite for WithClientIp { } } -impl WithClientIp { - pub fn new(inner: T) -> Self { - WithClientIp { - inner, - buf: BytesMut::with_capacity(128), - tlv_bytes: 0, - state: ProxyParse::NotStarted, - } - } - - pub fn client_addr(&self) -> Option { - match self.state { - ProxyParse::Finished(socket) => Some(socket), - _ => None, - } - } -} - -impl WithClientIp { - pub async fn wait_for_addr(&mut self) -> io::Result> { - match self.state { - ProxyParse::NotStarted => { - let mut pin = Pin::new(&mut *self); - let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; - match addr { - Some(addr) => self.state = ProxyParse::Finished(addr), - None => self.state = ProxyParse::None, - } - Ok(addr) - } - ProxyParse::Finished(addr) => Ok(Some(addr)), - ProxyParse::None => Ok(None), - } - } -} - /// Proxy Protocol Version 2 Header const HEADER: [u8; 12] = [ 0x0D, 0x0A, 
0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -impl WithClientIp { - /// implementation of - /// Version 2 (Binary Format) - fn poll_client_ip( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - // The binary header format starts with a constant 12 bytes block containing the protocol signature : - // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A - while self.buf.len() < 16 { - let mut this = self.as_mut().project(); - let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; +pub async fn read_proxy_protocol( + mut read: T, +) -> std::io::Result<(ChainRW, Option)> { + let mut buf = BytesMut::with_capacity(128); + while buf.len() < 16 { + let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(self.buf.len(), HEADER.len()); - if self.buf[..len] != HEADER[..len] { - return Poll::Ready(Ok(None)); - } - - // if no more bytes available then exit - if ready!(bytes_read) == 0 { - return Poll::Ready(Ok(None)); - }; + // exit for bad header + let len = usize::min(buf.len(), HEADER.len()); + if buf[..len] != HEADER[..len] { + return Ok((ChainRW { inner: read, buf }, None)); } - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = self.buf[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Poll::Ready(Err(io::Error::new( + // if no more bytes available then exit + if bytes_read == 0 { + return Ok((ChainRW { inner: read, buf }, None)); + }; + } + + let header = buf.split_to(16); + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contains the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. 
+ let vc = header[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. expected version 2", + )); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Err(io::Error::new( io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - ))); + "invalid proxy protocol command. expected local (0) or proxy (1)", + )) } - match command { - // the connection was established on purpose by the proxy - // without being relayed. The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. 
Receivers - // must drop connections presenting unexpected values here. - _ => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. expected local (0) or proxy (1)", - ))) - } - }; + }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = self.buf[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = header[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. 
+ 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; - // The 15th and 16th bytes is the address length in bytes in network endian order. - // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - ))); + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. 
+ let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. not enough to fit requested IP addresses", + )); + } + drop(header); + + while buf.len() < remaining_length as usize { + if read.read_buf(&mut buf).await? == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + )); } - - while self.buf.len() < 16 + address_length as usize { - let mut this = self.as_mut().project(); - if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - ))); - } - } - - let this = self.as_mut().project(); - - // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need - // discard the header we have parsed - this.buf.advance(16); - - // Starting from the 17th byte, addresses are presented in network byte order. 
- // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let addresses = this.buf.split_to(address_length as usize); - let socket = match address_length { - 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - _ => None, - }; - - *this.tlv_bytes = remaining_length - address_length; - self.as_mut().skip_tlv_inner(); - - Poll::Ready(Ok(socket)) } - #[cold] - fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let ip = ready!(self.as_mut().poll_client_ip(cx)?); - match ip { - Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x), - None => *self.as_mut().project().state = ProxyParse::None, + // Starting from the 17th byte, addresses are presented in network byte order. 
+ // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = buf.split_to(remaining_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) } - Poll::Ready(Ok(())) - } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; - #[cold] - fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let mut this = self.as_mut().project(); - // we know that this.buf is empty - debug_assert_eq!(this.buf.len(), 0); - - this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); - ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); - self.skip_tlv_inner(); - - Poll::Ready(Ok(())) - } - - fn skip_tlv_inner(self: Pin<&mut Self>) { - let tlv_bytes_read = match u16::try_from(self.buf.len()) { - // we read more than u16::MAX therefore we must have read the full tlv_bytes - Err(_) => self.tlv_bytes, - // we might not have read the full tlv bytes yet - Ok(n) => u16::min(n, self.tlv_bytes), - }; - let this = self.project(); - *this.tlv_bytes -= tlv_bytes_read; - this.buf.advance(tlv_bytes_read as usize); - } + Ok((ChainRW { inner: read, buf }, socket)) } -impl AsyncRead for WithClientIp { +impl AsyncRead for ChainRW { #[inline] fn poll_read( - mut self: Pin<&mut Self>, + self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - // I'm assuming these 3 comparisons will be easy to branch predict. 
- // especially with the cold attributes - // which should make this read wrapper almost invisible - - if let ProxyParse::NotStarted = self.state { - ready!(self.as_mut().read_ip(cx)?); - } - - while self.tlv_bytes > 0 { - ready!(self.as_mut().skip_tlv(cx)?) - } - - let this = self.project(); - if this.buf.is_empty() { - this.inner.poll_read(cx, buf) + if self.buf.is_empty() { + self.project().inner.poll_read(cx, buf) } else { - // we know that tlv_bytes is 0 - debug_assert_eq!(*this.tlv_bytes, 0); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) + self.read_from_buf(buf) } } } -impl Accept for ProxyProtocolAccept { - type Conn = WithConnectionGuard>; +impl ChainRW { + #[cold] + fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { + debug_assert!(!self.buf.is_empty()); + let this = self.project(); - type Error = io::Error; + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - - let conn_id = uuid::Uuid::new_v4(); - let span = tracing::info_span!("http_conn", ?conn_id); - { - let _enter = span.enter(); - tracing::info!("accepted new TCP connection"); + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); } - let Some(conn) = conn else { - return Poll::Ready(None); - }; - - Poll::Ready(Some(Ok(WithConnectionGuard { - inner: WithClientIp::new(conn), - connection_id: Uuid::new_v4(), - gauge: Mutex::new(Some( - NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[self.protocol]) - .guard(), - )), - span, - }))) - } -} - -pin_project! 
{ - pub struct WithConnectionGuard { - #[pin] - pub inner: T, - pub connection_id: Uuid, - pub gauge: Mutex>, - pub span: tracing::Span, - } - - impl PinnedDrop for WithConnectionGuard { - fn drop(this: Pin<&mut Self>) { - let _enter = this.span.enter(); - tracing::info!("HTTP connection closed") - } - } -} - -impl AsyncWrite for WithConnectionGuard { - #[inline] - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - self.project().inner.poll_write(cx, buf) - } - - #[inline] - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_flush(cx) - } - - #[inline] - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_shutdown(cx) - } - - #[inline] - fn poll_write_vectored( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - bufs: &[io::IoSlice<'_>], - ) -> Poll> { - self.project().inner.poll_write_vectored(cx, bufs) - } - - #[inline] - fn is_write_vectored(&self) -> bool { - self.inner.is_write_vectored() - } -} - -impl AsyncRead for WithConnectionGuard { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - self.project().inner.poll_read(cx, buf) + Poll::Ready(Ok(())) } } #[cfg(test)] mod tests { - use std::pin::pin; - use tokio::io::AsyncReadExt; - use crate::protocol2::{ProxyParse, WithClientIp}; + use crate::protocol2::read_proxy_protocol; #[tokio::test] async fn test_ipv4() { @@ -456,16 +244,15 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) - ); + assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); } #[tokio::test] @@ -488,17 
+275,17 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); assert_eq!( - read.state, - ProxyParse::Finished( - ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() - ) + addr, + Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) ); } @@ -506,24 +293,24 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] @@ -549,15 +336,14 @@ mod tests { let extra_data = [0xaa; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) - ); + assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6051c0a812..e4e095d77d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,6 +7,7 @@ pub mod handshake; pub 
mod passthrough; pub mod retry; pub mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; use crate::{ auth, @@ -15,16 +16,14 @@ use crate::{ config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, error::ReportableError, - metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE}, - protocol2::WithClientIp, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, EndpointCacheKey, }; use futures::TryFutureExt; use itertools::Itertools; -use metrics::IntCounterPairGuard; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; @@ -61,7 +60,6 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -79,31 +77,29 @@ pub async fn task_main( { let (socket, peer_addr) = accept_result?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["tcp"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); connections.spawn(async move { - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - match socket.wait_for_addr().await { - Ok(Some(addr)) => peer_addr = addr.ip(), + let (socket, peer_addr) = match read_proxy_protocol(socket).await{ + Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); return; } - Ok(None) if config.require_client_ip => { + Ok((_socket, None)) if config.require_client_ip => { error!("missing required client IP"); return; } - Ok(None) => {} - } + Ok((socket, None)) => (socket, peer_addr.ip()) + }; match socket.inner.set_nodelay(true) { Ok(()) => {}, @@ -113,7 +109,12 @@ pub async fn task_main( }, }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); let span = ctx.span.clone(); let res = handle_client( @@ -122,7 +123,6 @@ pub async fn task_main( cancellation_handler, socket, ClientMode::Tcp, - endpoint_rate_limiter, conn_gauge, ) .instrument(span.clone()) @@ -132,16 +132,14 @@ pub async fn task_main( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); error!(parent: &span, "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); - ctx.log(); } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + 
ctx.log_connect(); match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} Err(e) => { @@ -236,20 +234,23 @@ pub async fn handle_client( cancellation_handler: Arc, stream: S, mode: ClientMode, - endpoint_rate_limiter: Arc, - conn_gauge: IntCounterPairGuard, + conn_gauge: NumClientConnectionsGuard<'static>, ) -> Result>, ClientRequestError> { - info!("handling interactive connection from client"); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + let metrics = &Metrics::get().proxy; let proto = ctx.protocol; - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); + // let _client_gauge = metrics.client_connections.guard(proto); + let _request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(stream, mode.handshake_tls(tls)); + let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? 
{ HandshakeData::Startup(stream, params) => (stream, params), @@ -278,15 +279,6 @@ pub async fn handle_client( Err(e) => stream.throw_error(e).await?, }; - // check rate limit - if let Some(ep) = user_info.get_endpoint() { - if !endpoint_rate_limiter.check(ep, 1) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await?; - } - } - let user = user_info.get_user().to_owned(); let user_info = match user_info .authenticate( @@ -309,9 +301,14 @@ pub async fn handle_client( let mut node = connect_to_compute( ctx, - &TcpMechanism { params: ¶ms }, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, &user_info, mode.allow_self_signed_compute(config), + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, ) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 4c0d68ce0b..c8528d0296 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,14 +1,16 @@ use crate::{ auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, + config::RetryConfig, + console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, error::ReportableError, - metrics::NUM_CONNECTION_FAILURES, + metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, proxy::{ retry::{retry_after, ShouldRetry}, wake_compute::wake_compute, }, + Host, }; use async_trait::async_trait; use pq_proto::StartupMessageParams; @@ -27,10 +29,10 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - 
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); node_info.invalidate() } @@ -63,6 +65,9 @@ pub trait ComputeConnectBackend { pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, + + /// connect_to_compute concurrency lock + pub locks: &'static ApiLocks, } #[async_trait] @@ -78,6 +83,8 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { + let host = node_info.config.get_host()?; + let _permit = self.locks.get_permit(&host).await?; node_info.connect(ctx, timeout).await } @@ -93,19 +100,23 @@ pub async fn connect_to_compute( mechanism: &M, user_info: &B, allow_self_signed_compute: bool, + wake_compute_retry_config: RetryConfig, + connect_to_compute_retry_config: RetryConfig, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { let mut num_retries = 0; - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; if let Some(keys) = user_info.get_keys() { node_info.set_keys(keys); } node_info.allow_self_signed_compute = allow_self_signed_compute; // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); + let retry_type = RetryType::ConnectToCompute; // try once let err = match mechanism @@ -114,6 +125,13 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); return Ok(res); } Err(e) => e, @@ -121,10 +139,17 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = if !node_info.cached() { + let node_info = if !node_info.cached() || 
!err.should_retry_database_address() { // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. - if !err.should_retry(num_retries) { + if !err.should_retry(num_retries, connect_to_compute_retry_config) { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(err.into()); } node_info @@ -132,7 +157,8 @@ where // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.reuse_settings(old_node_info); mechanism.update_connect_config(&mut node_info.config); @@ -151,21 +177,40 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); + info!(?num_retries, "connected to compute node after"); return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries); + let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); if !retriable { error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(e.into()); } warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); } } - let wait_duration = retry_after(num_retries); + let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; + let pause = ctx + .latency_timer + 
.pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; + drop(pause); } } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 684be74f9a..4b09ebd8dc 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -41,7 +41,7 @@ where } #[tracing::instrument(skip_all)] -pub(super) async fn copy_bidirectional_client_compute( +pub async fn copy_bidirectional_client_compute( client: &mut Client, compute: &mut Compute, ) -> Result<(u64, u64), std::io::Error> diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 4665e07d23..dd935cc245 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -63,6 +63,7 @@ pub enum HandshakeData { pub async fn handshake( stream: S, mut tls: Option<&TlsConfig>, + record_handshake_error: bool, ) -> Result, HandshakeError> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); @@ -95,7 +96,9 @@ pub async fn handshake( if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; + let tls_stream = raw + .upgrade(tls.to_server_config(), record_handshake_error) + .await?; let (_, tls_server_end_point) = tls .cert_resolver diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c81a1a8292..62de79946f 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -2,11 +2,10 @@ use crate::{ cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, - metrics::NUM_BYTES_PROXIED_COUNTER, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, stream::Stream, usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; -use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; @@ -23,24 +22,25 @@ pub 
async fn proxy_pass( branch_id: aux.branch_id, }); - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); let mut client = MeasuredStream::new( client, |_| {}, |cnt| { // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); + metrics.get_metric(m_sent).inc_by(cnt as u64); usage.record_egress(cnt as u64); }, ); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); + let m_recv = metrics.with_labels(Direction::Rx); let mut compute = MeasuredStream::new( compute, |_| {}, |cnt| { // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); + metrics.get_metric(m_recv).inc_by(cnt as u64); }, ); @@ -60,8 +60,8 @@ pub struct ProxyPassthrough { pub compute: PostgresConnection, pub aux: MetricsAuxInfo, - pub req: IntCounterPairGuard, - pub conn: IntCounterPairGuard, + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, pub cancel: cancellation::Session

, } diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index a85ed380b0..36a05ba190 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,21 +1,18 @@ -use crate::compute; +use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -/// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; - pub trait ShouldRetry { fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { + fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, + _ if num_retries >= config.max_retries => false, err => err.could_retry(), } } + fn should_retry_database_address(&self) -> bool { + true + } } impl ShouldRetry for io::Error { @@ -39,6 +36,21 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } + fn should_retry_database_address(&self) -> bool { + use tokio_postgres::error::SqlState; + // Here are errors that happens after the user successfully authenticated to the database. + // TODO: there are pgbouncer errors that should be retried, but they are not listed here. 
+ !matches!( + self.code(), + &SqlState::TOO_MANY_CONNECTIONS + | &SqlState::OUT_OF_MEMORY + | &SqlState::SYNTAX_ERROR + | &SqlState::T_R_SERIALIZATION_FAILURE + | &SqlState::INVALID_CATALOG_NAME + | &SqlState::INVALID_SCHEMA_NAME + | &SqlState::INVALID_PARAMETER_VALUE + ) + } } impl ShouldRetry for tokio_postgres::Error { @@ -51,6 +63,15 @@ impl ShouldRetry for tokio_postgres::Error { false } } + fn should_retry_database_address(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::should_retry_database_address(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_database_address(db_err) + } else { + true + } + } } impl ShouldRetry for compute::ConnectionError { @@ -61,8 +82,17 @@ impl ShouldRetry for compute::ConnectionError { _ => false, } } + fn should_retry_database_address(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), + compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + _ => true, + } + } } -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { + config + .base_delay + .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 71d85e106d..ad48af0093 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -10,13 +10,13 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, }; -use crate::config::CertResolver; +use crate::config::{CertResolver, RetryConfig}; use crate::console::caches::NodeInfoCache; use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, 
CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; +use crate::proxy::retry::retry_after; use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; @@ -174,8 +174,8 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let client = WithClientIp::new(client); - let mut stream = match handshake(client, tls.as_ref()).await? { + let (client, _) = read_proxy_protocol(client).await?; + let mut stream = match handshake(client, tls.as_ref(), false).await? { HandshakeData::Startup(stream, _) => stream, HandshakeData::Cancel(_) => bail!("cancellation not supported"), }; @@ -361,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..NUM_RETRIES_CONNECT { - total_wait += retry_after(num_retries); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + for num_retries in 1..config.max_retries { + total_wait += retry_after(num_retries, config); } - assert!(total_wait < tokio::time::Duration::from_secs(12)); - assert!(total_wait > tokio::time::Duration::from_secs(10)); + assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] @@ -549,7 +553,12 @@ async fn connect_to_compute_success() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ 
-562,7 +571,12 @@ async fn connect_to_compute_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -576,7 +590,12 @@ async fn connect_to_compute_non_retry_1() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -590,7 +609,12 @@ async fn connect_to_compute_non_retry_2() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -600,17 +624,32 @@ async fn connect_to_compute_non_retry_2() { #[tokio::test] async fn connect_to_compute_non_retry_3() { let _ = env_logger::try_init(); - assert_eq!(NUM_RETRIES_CONNECT, 16); + tokio::time::pause(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![ - Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, 
Retry, - Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, - ]); + let mechanism = + TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); + let wake_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 1, + backoff_factor: 2.0, + }; + let connect_to_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute( + &mut ctx, + &mechanism, + &user_info, + false, + wake_compute_retry_config, + connect_to_compute_retry_config, + ) + .await + .unwrap_err(); mechanism.verify(); } @@ -622,7 +661,12 @@ async fn wake_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -636,7 +680,12 @@ async fn wake_non_retry() { let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 3b760e5dab..cbfc9f1358 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -34,7 
+34,10 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() { + let (end_client, startup) = match handshake(client1, Some(&server_config1), false) + .await + .unwrap() + { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(_) => panic!("cancellation not supported"), }; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index bfe4b7ec3a..3d9e94dd72 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,10 +1,14 @@ +use crate::config::RetryConfig; use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; -use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; +use crate::metrics::{ + ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, + WakeupFailureKind, +}; use crate::proxy::retry::retry_after; -use hyper::StatusCode; +use hyper1::StatusCode; use std::ops::ControlFlow; -use tracing::{error, warn}; +use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; @@ -13,25 +17,48 @@ pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, api: &B, + config: RetryConfig, ) -> Result { + let retry_type = RetryType::WakeCompute; loop { let wake_res = api.wake_compute(ctx).await; - match handle_try_wake(wake_res, *num_retries) { + match handle_try_wake(wake_res, *num_retries, config) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); report_error(&e, false); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + (*num_retries).into(), + ); return Err(e); } Ok(ControlFlow::Continue(e)) => { 
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); report_error(&e, true); } - Ok(ControlFlow::Break(n)) => return Ok(n), + Ok(ControlFlow::Break(n)) => { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + (*num_retries).into(), + ); + info!(?num_retries, "compute node woken up after"); + return Ok(n); + } } - let wait_duration = retry_after(*num_retries); + let wait_duration = retry_after(*num_retries, config); *num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); tokio::time::sleep(wait_duration).await; + drop(pause); } } @@ -42,10 +69,11 @@ pub async fn wake_compute( pub fn handle_try_wake( result: Result, num_retries: u32, + config: RetryConfig, ) -> Result, WakeComputeError> { match result { Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { Ok(ControlFlow::Continue(err)) } _ => Err(err), @@ -57,39 +85,47 @@ pub fn handle_try_wake( fn report_error(e: &WakeComputeError, retry: bool) { use crate::console::errors::ApiError; - let retry = bool_to_str(retry); let kind = match e { - WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, ref text, }) if text.contains("written data quota exceeded") || text.contains("the limit for current plan reached") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::UNPROCESSABLE_ENTITY, ref text, }) if text.contains("compute time quota of 
non-primary branches is exceeded") => { - "quota_exceeded" + WakeupFailureKind::QuotaExceeded } WakeComputeError::ApiError(ApiError::Console { status: StatusCode::LOCKED, .. - }) => "api_console_locked", + }) => WakeupFailureKind::ApiConsoleLocked, WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - }) => "api_console_bad_request", + }) => WakeupFailureKind::ApiConsoleBadRequest, WakeComputeError::ApiError(ApiError::Console { status, .. }) if status.is_server_error() => { - "api_console_other_server_error" + WakeupFailureKind::ApiConsoleOtherServerError } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", + WakeComputeError::ApiError(ApiError::Console { .. }) => { + WakeupFailureKind::ApiConsoleOtherError + } + WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, + WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); } diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 13dffffca0..c542267547 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,2 @@ -mod aimd; -mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; -pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; +pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - -use 
super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. -pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. 
round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - 
limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs deleted file mode 100644 index 5cd2d5ebb7..0000000000 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ /dev/null @@ -1,98 +0,0 @@ -//! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; - -use super::{limiter::Outcome, Aimd}; - -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; -} - -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Sample { - pub(crate) latency: Duration, - /// Jobs in flight when the sample was taken. 
- pub(crate) in_flight: usize, - pub(crate) outcome: Outcome, -} - -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] -pub enum RateLimitAlgorithm { - Fixed, - #[default] - Aimd, -} - -pub struct Fixed; - -#[async_trait] -impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { - old_limit - } -} - -#[derive(Clone, Copy, Debug)] -pub struct RateLimiterConfig { - pub disable: bool, - pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, - pub initial_limit: usize, - pub aimd_config: Option, -} - -impl RateLimiterConfig { - pub fn create_rate_limit_algorithm(self) -> Box { - match self.algorithm { - RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. - } - } -} - -impl Default for RateLimiterConfig { - fn default() -> Self { - Self { - disable: true, - algorithm: RateLimitAlgorithm::Aimd, - timeout: Duration::from_secs(1), - initial_limit: 100, - aimd_config: Some(AimdConfig::default()), - } - } -} - -#[derive(clap::Parser, Clone, Copy, Debug)] -pub struct AimdConfig { - /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1)] - pub aimd_min_limit: usize, - /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1500)] - pub aimd_max_limit: usize, - /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 10)] - pub aimd_increase_by: usize, - /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.9)] - pub aimd_decrease_factor: f32, - /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`. 
- #[clap(long, default_value_t = 0.8)] - pub aimd_min_utilisation_threshold: f32, -} - -impl Default for AimdConfig { - fn default() -> Self { - Self { - aimd_min_limit: 1, - aimd_max_limit: 1500, - aimd_increase_by: 10, - aimd_decrease_factor: 0.9, - aimd_min_utilisation_threshold: 0.8, - } - } -} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index f590896dd9..5ba2c36436 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -2,10 +2,9 @@ use std::{ borrow::Cow, collections::hash_map::RandomState, hash::{BuildHasher, Hash}, - net::IpAddr, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Mutex, }, }; @@ -13,24 +12,18 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Duration, Instant}; +use tokio::time::{Duration, Instant}; use tracing::info; -use crate::{intern::EndpointIdInt, EndpointId}; +use crate::intern::EndpointIdInt; -use super::{ - limit_algorithm::{LimitAlgorithm, Sample}, - RateLimiterConfig, -}; - -pub struct RedisRateLimiter { +pub struct GlobalRateLimiter { data: Vec, - info: &'static [RateBucketInfo], + info: Vec, } -impl RedisRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl GlobalRateLimiter { + pub fn new(info: Vec) -> Self { Self { data: vec![ RateBucket { @@ -50,7 +43,7 @@ impl RedisRateLimiter { let should_allow_request = self .data .iter_mut() - .zip(self.info) + .zip(&self.info) .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); if should_allow_request { @@ -68,15 +61,7 @@ impl RedisRateLimiter { // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. 
-// -// We also may save quite a lot of CPU (I think) by bailing out right after we -// saw SNI, before doing TLS handshake. User-side error messages in that case -// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now -// I went with a more expensive way that yields user-friendlier error messages. -pub type EndpointRateLimiter = BucketRateLimiter; - -// This can't be just per IP because that would limit some PaaS that share IP addresses -pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>; +pub type EndpointRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { map: DashMap, Hasher>, @@ -149,19 +134,6 @@ impl RateBucketInfo { Self::new(100, Duration::from_secs(600)), ]; - /// All of these are per endpoint-ip pair. - /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). - /// - /// First bucket: 300mcpus total per endpoint-ip pair - /// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first) - /// * 300 requests per second with 4096 hash rounds. - /// * 2 requests per second with 600000 hash rounds. - pub const DEFAULT_AUTH_SET: [Self; 3] = [ - Self::new(300 * 4096, Duration::from_secs(1)), - Self::new(200 * 4096, Duration::from_secs(60)), - Self::new(100 * 4096, Duration::from_secs(600)), - ]; - pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -259,419 +231,16 @@ impl BucketRateLimiter { } } -/// Limits the number of concurrent jobs. -/// -/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the -/// token once the job is finished. -/// -/// The limit will be automatically adjusted based on observed latency (delay) and/or failures -/// caused by overload (loss). 
-pub struct Limiter { - limit_algo: AsyncMutex>, - semaphore: std::sync::Arc, - config: RateLimiterConfig, - - // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED - limits: AtomicUsize, - - // ONLY USE ATOMIC ADD/SUB - in_flight: Arc, - - #[cfg(test)] - notifier: Option>, -} - -/// A concurrency token, required to run a job. -/// -/// Release the token back to the [Limiter] after the job is complete. -#[derive(Debug)] -pub struct Token<'t> { - permit: Option>, - start: Instant, - in_flight: Arc, -} - -/// A snapshot of the state of the [Limiter]. -/// -/// Not guaranteed to be consistent under high concurrency. -#[derive(Debug, Clone, Copy)] -pub struct LimiterState { - limit: usize, - in_flight: usize, -} - -/// Whether a job succeeded or failed as a result of congestion/overload. -/// -/// Errors not considered to be caused by overload should be ignored. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { - /// The job succeeded, or failed in a way unrelated to overload. - Success, - /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal - /// was observed. - Overload, -} - -impl Outcome { - fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self { - match error { - reqwest_middleware::Error::Middleware(_) => Outcome::Success, - reqwest_middleware::Error::Reqwest(e) => { - if let Some(status) = e.status() { - if status.is_server_error() - || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status - { - Outcome::Overload - } else { - Outcome::Success - } - } else { - Outcome::Success - } - } - } - } - fn from_reqwest_response(response: &reqwest::Response) -> Self { - if response.status().is_server_error() - || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS - { - Outcome::Overload - } else { - Outcome::Success - } - } -} - -impl Limiter { - /// Create a limiter with a given limit control algorithm. 
- pub fn new(config: RateLimiterConfig) -> Self { - assert!(config.initial_limit > 0); - Self { - limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()), - semaphore: Arc::new(Semaphore::new(config.initial_limit)), - config, - limits: AtomicUsize::new(config.initial_limit), - in_flight: Arc::new(AtomicUsize::new(0)), - #[cfg(test)] - notifier: None, - } - } - // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self { - // assert!(initial_limit > 0); - - // Self { - // limit_algo: AsyncMutex::new(limit_algorithm), - // semaphore: Arc::new(Semaphore::new(initial_limit)), - // timeout, - // limits: AtomicUsize::new(initial_limit), - // in_flight: Arc::new(AtomicUsize::new(0)), - // #[cfg(test)] - // notifier: None, - // } - // } - - /// In some cases [Token]s are acquired asynchronously when updating the limit. - #[cfg(test)] - pub fn with_release_notifier(mut self, n: std::sync::Arc) -> Self { - self.notifier = Some(n); - self - } - - /// Try to immediately acquire a concurrency [Token]. - /// - /// Returns `None` if there are none available. - pub fn try_acquire(&self) -> Option { - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - self.semaphore - .try_acquire() - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok() - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. - pub async fn acquire_timeout(&self, duration: Duration) -> Option> { - info!("acquiring token: {:?}", self.semaphore.available_permits()); - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. 
- Some(Token::new(None, self.in_flight.clone())) - } else { - match timeout(duration, self.semaphore.acquire()).await { - Ok(maybe_permit) => maybe_permit - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok(), - Err(_) => None, - } - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Return the concurrency [Token], along with the outcome of the job. - /// - /// The [Outcome] of the job, and the time taken to perform it, may be used - /// to update the concurrency limit. - /// - /// Set the outcome to `None` to ignore the job. - pub async fn release(&self, mut token: Token<'_>, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); - let in_flight = self.in_flight.load(Ordering::Acquire); - let old_limit = self.limits.load(Ordering::Acquire); - let available = if self.config.disable { - 0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0. - } else { - self.semaphore.available_permits() - }; - let total = in_flight + available; - - let mut algo = self.limit_algo.lock().await; - - let new_limit = if let Some(outcome) = outcome { - let sample = Sample { - latency: token.start.elapsed(), - in_flight, - outcome, - }; - algo.update(old_limit, sample).await - } else { - old_limit - }; - tracing::info!("new limit is {}", new_limit); - let actual_limit = if new_limit < total { - token.forget(); - total.saturating_sub(1) - } else { - if !self.config.disable { - self.semaphore.add_permits(new_limit.saturating_sub(total)); - } - new_limit - }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); - self.limits.store(new_limit, Ordering::Release); - #[cfg(test)] - if let Some(n) = &self.notifier { - n.notify_one(); - } - } - - /// The current state of the limiter. 
- pub fn state(&self) -> LimiterState { - let limit = self.limits.load(Ordering::Relaxed); - let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { limit, in_flight } - } -} - -impl<'t> Token<'t> { - fn new(permit: Option>, in_flight: Arc) -> Self { - Self { - permit, - start: Instant::now(), - in_flight, - } - } - - pub fn forget(&mut self) { - if let Some(permit) = self.permit.take() { - permit.forget(); - } - } -} - -impl Drop for Token<'_> { - fn drop(&mut self) { - self.in_flight.fetch_sub(1, Ordering::AcqRel); - } -} - -impl LimiterState { - /// The current concurrency limit. - pub fn limit(&self) -> usize { - self.limit - } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } -} - -#[async_trait::async_trait] -impl reqwest_middleware::Middleware for Limiter { - async fn handle( - &self, - req: reqwest::Request, - extensions: &mut task_local_extensions::Extensions, - next: reqwest_middleware::Next<'_>, - ) -> reqwest_middleware::Result { - let start = Instant::now(); - let token = self - .acquire_timeout(self.config.timeout) - .await - .ok_or_else(|| { - reqwest_middleware::Error::Middleware( - // TODO: Should we map it into user facing errors? 
- crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; - use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome}; - use crate::{ - rate_limiter::{RateBucketInfo, RateLimitAlgorithm}, - EndpointId, - }; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is 
acquired by token1" - ); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected. 
- limiter.release(token1, None).await; - limiter.release(token2, None).await; - } + use super::{BucketRateLimiter, EndpointRateLimiter}; + use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { @@ -721,39 +290,40 @@ mod tests { let limiter = EndpointRateLimiter::new(rates); let endpoint = EndpointId::from("ep-my-endpoint-1234"); + let endpoint = EndpointIdInt::from(endpoint); time::pause(); for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } // more connections fail - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { for _ in 0..50 { - assert!(limiter.check(endpoint.clone(), 2)); + assert!(limiter.check(endpoint, 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; - assert!(!limiter.check(endpoint.clone(), 1)); + assert!(!limiter.check(endpoint, 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { - assert!(limiter.check(endpoint.clone(), 1)); + assert!(limiter.check(endpoint, 1)); } } @@ -773,31 +343,4 @@ mod tests { } assert!(limiter.map.len() < 150_000); } - - #[test] - fn test_default_auth_set() { - // these values used to exceed u32::MAX - assert_eq!( - RateBucketInfo::DEFAULT_AUTH_SET, - [ - RateBucketInfo { - interval: Duration::from_secs(1), - 
max_rpi: 300 * 4096, - }, - RateBucketInfo { - interval: Duration::from_secs(60), - max_rpi: 200 * 4096 * 60, - }, - RateBucketInfo { - interval: Duration::from_secs(600), - max_rpi: 100 * 4096 * 600, - } - ] - ); - - for x in RateBucketInfo::DEFAULT_AUTH_SET { - let y = x.to_string().parse().unwrap(); - assert_eq!(x, y); - } - } } diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 422789813c..7baf104374 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,7 +5,7 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; use super::{ connection_with_credentials_provider::ConnectionWithCredentialsProvider, @@ -80,7 +80,7 @@ impl CancellationPublisher for Arc> { pub struct RedisPublisherClient { client: ConnectionWithCredentialsProvider, region_id: String, - limiter: RedisRateLimiter, + limiter: GlobalRateLimiter, } impl RedisPublisherClient { @@ -92,7 +92,7 @@ impl RedisPublisherClient { Ok(Self { client, region_id, - limiter: RedisRateLimiter::new(info), + limiter: GlobalRateLimiter::new(info.into()), }) } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index d183abb53a..3a90d911c2 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -77,10 +77,14 @@ impl ConnectionWithCredentialsProvider { } } + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + pub async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { - match redis::cmd("PING").query_async(con).await { + match Self::ping(con).await { Ok(()) => { return Ok(()); } @@ -96,7 
+100,7 @@ impl ConnectionWithCredentialsProvider { if let Some(f) = self.refresh_token_task.take() { f.abort() } - let con = self + let mut con = self .get_client() .await? .get_multiplexed_tokio_connection() @@ -109,6 +113,14 @@ impl ConnectionWithCredentialsProvider { }); self.refresh_token_task = Some(f); } + match Self::ping(&mut con).await { + Ok(()) => { + info!("Connection succesfully established"); + } + Err(e) => { + error!("Connection is broken. Error during PING: {e:?}"); + } + } self.con = Some(con); Ok(()) } diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 8b7e3e3419..87d723d17e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -4,6 +4,7 @@ use futures::StreamExt; use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; @@ -11,7 +12,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -77,6 +78,16 @@ struct MessageHandler { region_id: String, } +impl Clone for MessageHandler { + fn clone(&self) -> Self { + Self { + cache: self.cache.clone(), + cancellation_handler: self.cancellation_handler.clone(), + region_id: self.region_id.clone(), + } + } +} + impl MessageHandler { pub fn new( cache: Arc, @@ -89,11 +100,11 @@ impl MessageHandler { region_id, } } - pub fn disable_ttl(&self) { - self.cache.disable_ttl(); + pub async fn increment_active_listeners(&self) { + self.cache.increment_active_listeners().await; } - pub fn enable_ttl(&self) { - self.cache.enable_ttl(); + pub async fn decrement_active_listeners(&self) { + 
self.cache.decrement_active_listeners().await; } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { @@ -104,9 +115,9 @@ impl MessageHandler { let msg: Notification = match serde_json::from_str(&payload) { Ok(msg) => msg, Err(e) => { - REDIS_BROKEN_MESSAGES - .with_label_values(&[msg.get_channel_name()]) - .inc(); + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); tracing::error!("broken message: {e}"); return Ok(()); } @@ -118,6 +129,10 @@ impl MessageHandler { "session_id", &tracing::field::display(cancel_session.session_id), ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); if let Some(cancel_region) = cancel_session.region_id { // If the message is not for this region, ignore it. if cancel_region != self.region_id { @@ -138,6 +153,17 @@ impl MessageHandler { } _ => { invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, PasswordUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. @@ -167,37 +193,24 @@ fn invalidate_cache(cache: Arc, msg: Notification) { } } -/// Handle console's invalidation messages. 
-#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main( +async fn handle_messages( + handler: MessageHandler, redis: ConnectionWithCredentialsProvider, - cache: Arc, - cancel_map: CancelMap, - region_id: String, -) -> anyhow::Result -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - cache.enable_ttl(); - let handler = MessageHandler::new( - cache, - Arc::new(CancellationHandler::<()>::new( - cancel_map, - NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, - )), - region_id, - ); - + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { loop { + if cancellation_token.is_cancelled() { + return Ok(()); + } let mut conn = match try_connect(&redis).await { Ok(conn) => { - handler.disable_ttl(); + handler.increment_active_listeners().await; conn } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -211,8 +224,47 @@ where break; } } + if cancellation_token.is_cancelled() { + handler.decrement_active_listeners().await; + return Ok(()); + } } - handler.enable_ttl(); + handler.decrement_active_listeners().await; + } +} + +/// Handle console's invalidation messages. +#[tracing::instrument(name = "redis_notifications", skip_all)] +pub async fn task_main( + redis: ConnectionWithCredentialsProvider, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + cancel_map, + crate::metrics::CancellationSource::FromRedis, + )); + let handler = MessageHandler::new(cache, cancellation_handler, region_id); + // 6h - 1m. + // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
+ let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); + loop { + let cancellation_token = CancellationToken::new(); + interval.tick().await; + + tokio::spawn(handle_messages( + handler.clone(), + redis.clone(), + cancellation_token.clone(), + )); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h. + cancellation_token.cancel(); + }); } } diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a2010fd613..1a0d1f7b0e 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -4,42 +4,47 @@ mod backend; mod conn_pool; +mod http_util; mod json; mod sql_over_http; -pub mod tls_listener; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::TlsAcceptor; use tokio_util::task::TaskTracker; -use tracing::instrument::Instrumented; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard}; -use crate::rate_limiter::EndpointRateLimiter; +use crate::metrics::Metrics; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::run_until_cancelled; use crate::serverless::backend::PoolingBackend; -use hyper::{ - server::conn::{AddrIncoming, AddrStream}, - Body, Method, Request, Response, -}; +use 
crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::Arc; -use std::task::Poll; -use tls_listener::TlsListener; -use tokio::net::TcpListener; -use tokio_util::sync::{CancellationToken, DropGuard}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use utils::http::{error::ApiError, json::json_response}; +use utils::http::error::ApiError; pub const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -47,7 +52,6 @@ pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, - endpoint_rate_limiter: Arc, cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -91,161 +95,178 @@ pub async fn task_main( tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - protocol: "http", - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` + let server = Builder::new(hyper_util::rt::TokioExecutor::new()); - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout); + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; + } + let conn_id = uuid::Uuid::new_v4(); + let 
http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream< - WithConnectionGuard>, - >| { - let (conn, _) = stream.get_ref(); + connections.spawn( + connection_handler( + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + cancellation_token.clone(), + server.clone(), + tls_acceptor.clone(), + conn, + peer_addr, + ) + .instrument(http_conn_span), + ); + } - // this is jank. should dissapear with hyper 1.0 migration. - let gauge = conn - .gauge - .lock() - .expect("lock should not be poisoned") - .take() - .expect("gauge should be set on connection start"); - - // Cancel all current inflight HTTP requests if the HTTP connection is closed. - let http_cancellation_token = CancellationToken::new(); - let cancel_connection = http_cancellation_token.clone().drop_guard(); - - let span = conn.span.clone(); - let client_addr = conn.inner.client_addr(); - let remote_addr = conn.inner.inner.remote_addr(); - let backend = backend.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new( - hyper::service::service_fn(move |req: Request| { - let backend = backend.clone(); - let ws_connections2 = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - let cancellation_handler = cancellation_handler.clone(); - let http_cancellation_token = http_cancellation_token.child_token(); - - // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. - // By spawning the future, we ensure it never gets cancelled until it decides to. 
- ws_connections.spawn( - async move { - // Cancel the current inflight HTTP request if the requets stream is closed. - // This is slightly different to `_cancel_connection` in that - // h2 can cancel individual requests with a `RST_STREAM`. - let _cancel_session = http_cancellation_token.clone().drop_guard(); - - let res = request_handler( - req, - config, - backend, - ws_connections2, - cancellation_handler, - peer_addr.ip(), - endpoint_rate_limiter, - http_cancellation_token, - ) - .await - .map_or_else(|e| e.into_response(), |r| r); - - _cancel_session.disarm(); - - res - } - .in_current_span(), - ) - }), - gauge, - cancel_connection, - span, - )) - } - }, - ); - - hyper::Server::builder(tls_listener) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; - - // await websocket connections - ws_connections.wait().await; + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, -} +/// Handles the TCP lifecycle. +/// +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +/// 3. Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. 
+#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + cancellation_token: CancellationToken, + server: Builder, + tls_acceptor: TlsAcceptor, + conn: TcpStream, + peer_addr: SocketAddr, +) { + let session_id = uuid::Uuid::new_v4(); -impl MetricService { - fn new( - inner: S, - _gauge: IntCounterPairGuard, - _cancel: DropGuard, - span: tracing::Span, - ) -> MetricService { - MetricService { - inner, - _gauge, - _cancel, - span, + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + // handle PROXY protocol + let (conn, peer) = match read_proxy_protocol(conn).await { + Ok(c) => c, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return; } - } -} + }; -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; + let peer_addr = peer.unwrap_or(peer_addr).ip(); + let has_private_peer_addr = match peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + }; + info!(?session_id, %peer_addr, "accepted new TCP connection"); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // try upgrade to TLS, but with a timeout. 
+ let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + // The handshake timed out + Err(e) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return; + } + }; - fn call(&mut self, req: Request) -> Self::Future { - self.span - .in_scope(|| self.inner.call(req)) - .instrument(self.span.clone()) + let session_id = AtomicTake::new(session_id); + + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); + + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. 
+ let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + http_request_token, + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, cancellation_handler: Arc, + session_id: uuid::Uuid, peer_addr: IpAddr, - endpoint_rate_limiter: Arc, // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, -) -> Result, ApiError> { - let session_id = uuid::Uuid::new_v4(); - +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -255,7 +276,13 @@ async fn request_handler( // Check if the request is a websocket upgrade request. 
if hyper_tungstenite::is_upgrade_request(&request) { - let ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); + let span = ctx.span.clone(); info!(parent: &span, "performing websocket upgrade"); @@ -264,15 +291,9 @@ async fn request_handler( ws_connections.spawn( async move { - if let Err(e) = websocket::serve_websocket( - config, - ctx, - websocket, - cancellation_handler, - host, - endpoint_rate_limiter, - ) - .await + if let Err(e) = + websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host) + .await { error!("error in websocket connection: {e:#}"); } @@ -282,14 +303,19 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); let span = ctx.span.clone(); sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -299,7 +325,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { 
json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8aa5ad4e8a..963913a260 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -6,14 +6,16 @@ use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, - config::ProxyConfig, + config::{AuthenticationConfig, ProxyConfig}, console::{ errors::{GetAuthInfoError, WakeComputeError}, + locks::ApiLocks, CachedNodeInfo, }, context::RequestMonitoring, error::{ErrorKind, ReportableError, UserFacingError}, - proxy::connect_compute::ConnectMechanism, + proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + Host, }; use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; @@ -27,6 +29,7 @@ impl PoolingBackend { pub async fn authenticate( &self, ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, conn_info: &ConnInfo, ) -> Result { let user_info = conn_info.user_info.clone(); @@ -43,6 +46,7 @@ impl PoolingBackend { let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, + config, secret, &user_info.endpoint, true, @@ -103,9 +107,12 @@ impl PoolingBackend { conn_id, conn_info, pool: self.pool.clone(), + locks: &self.config.connect_compute_locks, }, &backend, false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, ) .await } @@ -150,16 +157,31 @@ impl UserFacingError for HttpConnError { } } +impl ShouldRetry for HttpConnError { + fn could_retry(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::ConnectionClosedAbruptly(_) => false, + HttpConnError::GetAuthInfo(_) => false, + HttpConnError::AuthError(_) => false, + HttpConnError::WakeCompute(_) => false, + } + } +} + struct TokioMechanism { 
pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static ApiLocks, } #[async_trait] impl ConnectMechanism for TokioMechanism { type Connection = Client; - type ConnectError = tokio_postgres::Error; + type ConnectError = HttpConnError; type Error = HttpConnError; async fn connect_once( @@ -168,6 +190,9 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + let mut config = (*node_info.config).clone(); let config = config .user(&self.conn_info.user_info.user) @@ -175,7 +200,10 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + drop(pause); + drop(permit); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 35311facb8..798e488509 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,6 +1,5 @@ use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; @@ -16,13 +15,13 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, 
metrics::NUM_DB_CONNECTIONS_GAUGE, - DbName, EndpointCacheKey, RoleName, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; use tracing::{debug, error, warn, Span}; @@ -78,7 +77,7 @@ pub struct EndpointConnPool { pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, } @@ -110,7 +109,11 @@ impl EndpointConnPool { let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); } *total_conns -= removed; removed > 0 @@ -156,7 +159,11 @@ impl EndpointConnPool { pool.total_conns += 1; pool.global_connections_count .fetch_add(1, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.inc(); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -176,7 +183,11 @@ impl Drop for EndpointConnPool { if self.total_conns > 0 { self.global_connections_count .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(self.total_conns as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); } } } @@ -215,7 +226,11 @@ impl DbUserConnPool { removed += 1; } global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } @@ -303,7 +318,10 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + 
.http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; shard.retain(|endpoint, x| { @@ -331,7 +349,7 @@ impl GlobalConnPool { let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); // Do logging outside of the lock. if clients_removed > 0 { @@ -339,7 +357,11 @@ impl GlobalConnPool { .global_connections_count .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - clients_removed; - NUM_OPEN_CLIENTS_IN_HTTP_POOL.sub(clients_removed as i64); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); } let removed = current_len - new_len; @@ -410,7 +432,7 @@ impl GlobalConnPool { pools: HashMap::new(), total_conns: 0, max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); @@ -450,9 +472,7 @@ pub fn poll_client( conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); let mut session_id = ctx.session_id; let (tx, mut rx) = tokio::sync::watch::channel(session_id); @@ -469,15 +489,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = 
*rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -532,6 +569,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -541,10 +579,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -697,6 +743,7 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..ab9127b13e --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,92 @@ +//! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. 
+ +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] +struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg 
}.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 00dffd5784..e856053a7e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,18 +1,22 @@ use std::pin::pin; use std::sync::Arc; +use bytes::Bytes; use futures::future::select; use futures::future::try_join; use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; use serde_json::json; use serde_json::Value; use tokio::time; @@ -29,7 +33,6 @@ use tracing::error; use tracing::info; use url::Url; use utils::http::error::ApiError; -use 
utils::http::json::json_response; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; @@ -40,8 +43,8 @@ use crate::context::RequestMonitoring; use crate::error::ErrorKind; use crate::error::ReportableError; use crate::error::UserFacingError; -use crate::metrics::HTTP_CONTENT_LENGTH; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; @@ -52,6 +55,7 @@ use crate::RoleName; use super::backend::PoolingBackend; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; use super::json::JsonConversionError; @@ -218,10 +222,10 @@ fn get_conn_info( pub async fn handle( config: &'static ProxyConfig, mut ctx: RequestMonitoring, - request: Request, + request: Request, backend: Arc, cancel: CancellationToken, -) -> Result, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &mut ctx, request, backend).await; let mut response = match result { @@ -332,10 +336,9 @@ pub async fn handle( } }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } @@ -396,7 +399,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), + Read(#[from] hyper1::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -437,7 +440,7 @@ struct HttpHeaders { } impl HttpHeaders { - fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { // 
Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); @@ -488,13 +491,14 @@ async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, + request: Request, backend: Arc, -) -> Result, SqlOverHttpError> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - info!("handling interactive connection from client"); +) -> Result>, SqlOverHttpError> { + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params @@ -517,9 +521,10 @@ async fn handle_inner( None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH - .with_label_values(&["request"]) - .observe(request_content_length as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -528,7 +533,7 @@ async fn handle_inner( } let fetch_and_process_request = async { - let body = hyper::body::to_bytes(request.into_body()).await?; + let body = request.into_body().collect().await?.to_bytes(); info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly @@ -536,7 +541,9 @@ async fn handle_inner( .map_err(SqlOverHttpError::from); let authenticate_and_connect = async { - let keys = backend.authenticate(ctx, &conn_info).await?; + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; let client = backend 
.connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; @@ -596,7 +603,7 @@ async fn handle_inner( let body = serde_json::to_string(&result).expect("json serialization should not fail"); let len = body.len(); let response = response - .body(Body::from(body)) + .body(Full::new(Bytes::from(body))) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -604,9 +611,10 @@ async fn handle_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); - HTTP_CONTENT_LENGTH - .with_label_values(&["response"]) - .observe(len as f64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); Ok(response) } @@ -639,6 +647,7 @@ impl QueryData { } // The query was cancelled. Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { tracing::error!(?err, "could not cancel query"); } diff --git a/proxy/src/serverless/tls_listener.rs b/proxy/src/serverless/tls_listener.rs deleted file mode 100644 index 33f194dd59..0000000000 --- a/proxy/src/serverless/tls_listener.rs +++ /dev/null @@ -1,123 +0,0 @@ -use std::{ - convert::Infallible, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; - -use hyper::server::{accept::Accept, conn::AddrStream}; -use pin_project_lite::pin_project; -use tokio::{ - io::{AsyncRead, AsyncWrite}, - task::JoinSet, - time::timeout, -}; -use tokio_rustls::{server::TlsStream, TlsAcceptor}; -use tracing::{info, warn, Instrument}; - -use crate::{ - metrics::TLS_HANDSHAKE_FAILURES, - protocol2::{WithClientIp, WithConnectionGuard}, -}; - -pin_project! 
{ - /// Wraps a `Stream` of connections (such as a TCP listener) so that each connection is itself - /// encrypted using TLS. - pub(crate) struct TlsListener { - #[pin] - listener: A, - tls: TlsAcceptor, - waiting: JoinSet>>, - timeout: Duration, - } -} - -impl TlsListener { - /// Create a `TlsListener` with default options. - pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self { - TlsListener { - listener, - tls, - waiting: JoinSet::new(), - timeout, - } - } -} - -impl Accept for TlsListener -where - A: Accept>>, - A::Error: std::error::Error, - A::Conn: AsyncRead + AsyncWrite + Unpin + Send + 'static, -{ - type Conn = TlsStream; - - type Error = Infallible; - - fn poll_accept( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let mut this = self.project(); - - loop { - match this.listener.as_mut().poll_accept(cx) { - Poll::Pending => break, - Poll::Ready(Some(Ok(mut conn))) => { - let t = *this.timeout; - let tls = this.tls.clone(); - let span = conn.span.clone(); - this.waiting.spawn(async move { - let peer_addr = match conn.inner.wait_for_addr().await { - Ok(Some(addr)) => addr, - Err(e) => { - tracing::error!("failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); - return None; - } - Ok(None) => conn.inner.inner.remote_addr() - }; - - let accept = tls.accept(conn); - match timeout(t, accept).await { - Ok(Ok(conn)) => { - info!(%peer_addr, "accepted new TLS connection"); - Some(conn) - }, - // The handshake failed, try getting another connection from the queue - Ok(Err(e)) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: {e:?}"); - None - } - // The handshake timed out, try getting another connection from the queue - Err(_) => { - TLS_HANDSHAKE_FAILURES.inc(); - warn!(%peer_addr, "failed to accept TLS connection: timeout"); - None - } - } - }.instrument(span)); - } - Poll::Ready(Some(Err(e))) => { - tracing::error!("error accepting TCP connection: {e}"); - 
continue; - } - Poll::Ready(None) => return Poll::Ready(None), - } - } - - loop { - return match this.waiting.poll_join_next(cx) { - Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))), - // The handshake failed to complete, try getting another connection from the queue - Poll::Ready(Some(Ok(None))) => continue, - // The handshake panicked or was cancelled. ignore and get another connection - Poll::Ready(Some(Err(e))) => { - tracing::warn!("handshake aborted: {e}"); - continue; - } - _ => Poll::Pending, - }; - } - } -} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index ada6c974f4..b6cd85af73 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,9 +3,8 @@ use crate::{ config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, - metrics::NUM_CLIENT_CONNECTION_GAUGE, + metrics::Metrics, proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -136,12 +135,12 @@ pub async fn serve_websocket( websocket: HyperWebsocket, cancellation_handler: Arc, hostname: Option, - endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; - let conn_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&["ws"]) - .guard(); + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); let res = handle_client( config, @@ -149,7 +148,6 @@ pub async fn serve_websocket( cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, - endpoint_rate_limiter, conn_gauge, ) .await; @@ -158,17 +156,15 @@ pub async fn serve_websocket( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - ctx.log(); Err(e.into()) } Ok(None) => { ctx.set_success(); - ctx.log(); Ok(()) } Ok(Some(p)) => { ctx.set_success(); - ctx.log(); + ctx.log_connect(); p.proxy_pass().await } } diff 
--git a/proxy/src/stream.rs b/proxy/src/stream.rs index b6b7a85659..690e92ffb1 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::TLS_HANDSHAKE_FAILURES; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -223,12 +223,20 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { + pub async fn upgrade( + self, + cfg: Arc, + record_handshake_error: bool, + ) -> Result, StreamUpgradeError> { match self { Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) .accept(raw) .await - .inspect_err(|_| TLS_HANDSHAKE_FAILURES.inc())?), + .inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 5ffbf95c07..56ed2145dc 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -495,7 +495,7 @@ mod tests { use url::Url; use super::*; - use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId}; + use crate::{http, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -525,7 +525,7 @@ mod tests { tokio::spawn(server); let metrics = Metrics::default(); - let client = http::new_client(RateLimiterConfig::default()); + let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); diff --git a/pyproject.toml b/pyproject.toml index e41aa2cda8..11f410ef43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.2" +aiohttp = "3.9.4" 
pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 50a5a4185b..214de0a77d 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.77.0" +channel = "1.78.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 4d136472e0..dd5d453a2b 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,9 +22,15 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true +native-tls.workspace = true +postgres-native-tls.workspace = true +postgres_ffi.workspace = true tokio-stream.workspace = true +tokio-postgres.workspace = true +tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true +camino.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md index 2f21b9f191..c1deab8852 100644 --- a/s3_scrubber/README.md +++ b/s3_scrubber/README.md @@ -67,10 +67,12 @@ the purge command will log all the keys that it would have deleted. #### `scan-metadata` -Walk objects in a pageserver S3 bucket, and report statistics on the contents. +Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency. +Errors are logged to stderr and summary to stdout. 
+For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata +env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 @@ -82,6 +84,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053 ``` +For safekeepers, dump_db_connstr and dump_db_table must be +specified; they should point to table with debug dump which will be used +to list timelines and find their backup and start LSNs. + ## Cleaning up running pageservers If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. 
diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs index 45cac23690..70b108cf23 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -1,11 +1,13 @@ -use std::time::Duration; - use chrono::{DateTime, Utc}; +use futures::Future; use hex::FromHex; + use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -137,7 +139,7 @@ pub struct ProjectData { pub region_id: String, pub platform_id: String, pub user_id: String, - pub pageserver_id: u64, + pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, pub safekeepers: Vec, @@ -155,7 +157,7 @@ pub struct ProjectData { pub maintenance_set: Option, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Clone, serde::Deserialize)] pub struct BranchData { pub id: BranchId, pub created_at: DateTime, @@ -210,30 +212,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("tenant_id", tenant_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + 
response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_tenant_project", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::BodyRead(e), - ) - })?; match response.data.len() { 0 => Ok(None), 1 => Ok(Some( @@ -261,42 +272,34 @@ impl CloudAdminApiClient { const PAGINATION_LIMIT: usize = 512; let mut result: Vec = Vec::with_capacity(PAGINATION_LIMIT); loop { - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("show_deleted", "false".to_string()), - ("limit", format!("{PAGINATION_LIMIT}")), - ("offset", format!("{pagination_offset}")), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "List active projects".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response_bytes = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("show_deleted", "false".to_string()), + ("limit", format!("{PAGINATION_LIMIT}")), + ("offset", format!("{pagination_offset}")), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "List active projects".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - match response.status() { - StatusCode::OK => {} - StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => { - tokio::time::sleep(Duration::from_millis(500)).await; - continue; - } - _status => { - return Err(Error::new( - "List active projects".to_string(), - ErrorKind::ResponseStatus(response.status()), - )) - } - } - - let response_bytes = response.bytes().await.map_err(|e| { - Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) - })?; + response.bytes().await.map_err(|e| { + 
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) + }) + }, + "list_projects", + ) + .await?; let decode_result = serde_json::from_slice::>>(&response_bytes); @@ -327,6 +330,7 @@ impl CloudAdminApiClient { pub async fn find_timeline_branch( &self, + tenant_id: TenantId, timeline_id: TimelineId, ) -> Result, Error> { let _permit = self @@ -335,43 +339,61 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/branches")) - .query(&[ - ("timeline_id", timeline_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::BodyRead(e), - ) - })?; - match response.data.len() { - 0 => Ok(None), - 1 => Ok(Some( - response - .data - .into_iter() - .next() - .expect("Should have exactly one element"), - )), - too_many => Err(Error::new( - format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"), + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_timeline_branch", + ) + .await?; + + let mut branches: Vec = 
response.data.into_iter().collect(); + // Normally timeline_id is unique. However, we do have at least one case + // of the same timeline_id in two different projects, apparently after + // manual recovery. So always recheck project_id (discovered through + // tenant_id). + let project_data = match self.find_tenant_project(tenant_id).await? { + Some(pd) => pd, + None => return Ok(None), + }; + branches.retain(|b| b.project_id == project_data.id); + if branches.len() < 2 { + Ok(branches.first().cloned()) + } else { + Err(Error::new( + format!( + "Find branch for timeline {}/{} returned {} branches instead of 0 or 1", + tenant_id, + timeline_id, + branches.len() + ), ErrorKind::UnexpectedState, - )), + )) } } @@ -532,4 +554,15 @@ impl CloudAdminApiClient { .parse() .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) } + + async fn with_retries(op: O, description: &str) -> Result + where + O: FnMut() -> F, + F: Future>, + { + let cancel = CancellationToken::new(); // not really used + backoff::retry(op, |_| false, 1, 20, description, &cancel) + .await + .expect("cancellations are disabled") + } } diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index 7a08dffc66..ce0ff10ec6 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -60,6 +60,7 @@ pub struct GarbageList { /// see garbage, we saw some active tenants too. This protects against classes of bugs /// in the scrubber that might otherwise generate a "deleted all" result. active_tenant_count: usize, + active_timeline_count: usize, } impl GarbageList { @@ -67,6 +68,7 @@ impl GarbageList { Self { items: Vec::new(), active_tenant_count: 0, + active_timeline_count: 0, node_kind, bucket_config, } @@ -119,7 +121,10 @@ pub async fn find_garbage( const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. 
-const CONSOLE_CONCURRENCY: usize = 128; +// +// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It +// would be better to implement real rsp limiter. +const CONSOLE_CONCURRENCY: usize = 16; struct ConsoleCache { /// Set of tenants found in the control plane API @@ -221,6 +226,7 @@ async fn find_garbage_inner( } else { tracing::debug!("Tenant {tenant_shard_id} is active"); active_tenants.push(tenant_shard_id); + garbage.active_tenant_count = active_tenants.len(); } counter += 1; @@ -261,7 +267,7 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); async move { api_client - .find_timeline_branch(ttid.timeline_id) + .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (ttid, r)) @@ -271,15 +277,29 @@ async fn find_garbage_inner( std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. + let mut active_timelines: Vec = vec![]; while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { tracing::debug!("Timeline {ttid} is garbage"); } else { tracing::debug!("Timeline {ttid} is active"); + active_timelines.push(ttid); + garbage.active_timeline_count = active_timelines.len(); } } + let num_garbage_timelines = garbage + .items + .iter() + .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + .count(); + tracing::info!( + "Found {}/{} garbage timelines in active tenants", + num_garbage_timelines, + active_timelines.len(), + ); + Ok(garbage) } @@ -344,16 +364,22 @@ pub async fn get_timeline_objects( const MAX_KEYS_PER_DELETE: usize = 1000; /// Drain a buffer of keys into DeleteObjects requests +/// +/// If `drain` is true, drains keys completely; otherwise stops when < +/// MAX_KEYS_PER_DELETE keys are left. 
+/// `num_deleted` returns number of deleted keys. async fn do_delete( s3_client: &Arc, bucket_name: &str, keys: &mut Vec, dry_run: bool, drain: bool, + progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); for k in request_keys { @@ -368,12 +394,30 @@ async fn do_delete( .send() .await .context("DeleteObjects request")?; + progress_tracker.register(num_deleted); } } Ok(()) } +/// Simple tracker reporting each 10k deleted keys. +#[derive(Default)] +struct DeletionProgressTracker { + num_deleted: usize, + last_reported_num_deleted: usize, +} + +impl DeletionProgressTracker { + fn register(&mut self, n: usize) { + self.num_deleted += n; + if self.num_deleted - self.last_reported_num_deleted > 10000 { + tracing::info!("progress: deleted {} keys", self.num_deleted); + self.last_reported_num_deleted = self.num_deleted; + } + } +} + pub async fn purge_garbage( input_path: String, mode: PurgeMode, @@ -394,6 +438,14 @@ pub async fn purge_garbage( if garbage_list.active_tenant_count == 0 { anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants"); } + if garbage_list + .items + .iter() + .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + && garbage_list.active_timeline_count == 0 + { + anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + } let filtered_items = garbage_list .items @@ -429,6 +481,7 @@ pub async fn purge_garbage( std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); let mut objects_to_delete = Vec::new(); + let mut progress_tracker = DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; 
objects_to_delete.append(&mut object_list); @@ -439,6 +492,7 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, false, + &mut progress_tracker, ) .await?; } @@ -450,10 +504,11 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, true, + &mut progress_tracker, ) .await?; - tracing::info!("Fell through"); + tracing::info!("{} keys deleted in total", progress_tracker.num_deleted); Ok(()) } diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2842877d0..e976e66748 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -4,7 +4,9 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; -pub mod scan_metadata; +pub mod scan_pageserver_metadata; +pub mod scan_safekeeper_metadata; +pub mod tenant_snapshot; use std::env; use std::fmt::Display; @@ -23,17 +25,18 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; use aws_smithy_async::rt::sleep::TokioSleep; +use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::io::IsTerminal; use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::TimelineId; +use utils::fs_ext; +use utils::id::{TenantId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -139,12 +142,34 @@ impl RootTarget { pub fn tenants_root(&self) -> S3Target { match self { Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - Self::Safekeeper(root) => root.with_sub_segment("wal"), + Self::Safekeeper(root) => root.clone(), } } pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) + match self { + Self::Pageserver(_) => 
self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } + } + + pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { + // Only pageserver remote storage contains tenant-shards + assert!(matches!(self, Self::Pageserver(_))); + let Self::Pageserver(root) = self else { + panic!(); + }; + + S3Target { + bucket_name: root.bucket_name.clone(), + prefix_in_bucket: format!( + "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", + root.prefix_in_bucket + ), + delimiter: root.delimiter.clone(), + } } pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { @@ -240,7 +265,6 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { .with_ansi(false) .with_writer(file_writer); let stderr_logs = fmt::Layer::new() - .with_ansi(std::io::stderr().is_terminal()) .with_target(false) .with_writer(std::io::stderr); tracing_subscriber::registry() @@ -319,9 +343,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), + prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), delimiter, }), }; @@ -346,7 +368,10 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { - error!("list_objects_v2 query failed: {e}"); + error!( + "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", + s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + ); tokio::time::sleep(Duration::from_secs(1)).await; } } @@ -396,3 +421,50 @@ async fn download_object_with_retries( anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") } + +async fn download_object_to_file( + s3_client: &Client, + bucket_name: &str, + key: &str, + version_id: Option<&str>, + local_path: &Utf8Path, +) -> anyhow::Result<()> { + 
let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp")); + for _ in 0..MAX_RETRIES { + tokio::fs::remove_file(&tmp_path) + .await + .or_else(fs_ext::ignore_not_found)?; + + let mut file = tokio::fs::File::create(&tmp_path) + .await + .context("Opening output file")?; + + let request = s3_client.get_object().bucket(bucket_name).key(key); + + let request = match version_id { + Some(version_id) => request.version_id(version_id), + None => request, + }; + + let response_stream = match request.send().await { + Ok(response) => response, + Err(e) => { + error!( + "Failed to download object for key {key} version {}: {e:#}", + version_id.unwrap_or("") + ); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut read_stream = response_stream.body.into_async_read(); + + tokio::io::copy(&mut read_stream, &mut file).await?; + + tokio::fs::rename(&tmp_path, local_path).await?; + return Ok(()); + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 957213856b..e49c280b99 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,9 +1,16 @@ +use anyhow::bail; +use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; +use s3_scrubber::scan_pageserver_metadata::scan_metadata; +use s3_scrubber::tenant_snapshot::SnapshotDownloader; +use s3_scrubber::{ + init_logging, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, + NodeKind, TraversingDepth, +}; use clap::{Parser, Subcommand}; +use utils::id::TenantId; #[derive(Parser)] #[command(author, version, about, long_about = None)] @@ -32,11 +39,28 @@ enum Command { #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] mode: PurgeMode, }, + 
#[command(verbatim_doc_comment)] ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, #[arg(short, long, default_value_t = false)] json: bool, #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to db with debug dump + dump_db_connstr: Option, + /// For safekeeper node_kind only, table in the db with debug dump + #[arg(long, default_value = None)] + dump_db_table: Option, + }, + TenantSnapshot { + #[arg(long = "tenant-id")] + tenant_id: TenantId, + #[arg(long = "concurrency", short = 'j', default_value_t = 8)] + concurrency: usize, + #[arg(short, long)] + output_path: Utf8PathBuf, }, } @@ -50,6 +74,7 @@ async fn main() -> anyhow::Result<()> { Command::ScanMetadata { .. } => "scan", Command::FindGarbage { .. } => "find-garbage", Command::PurgeGarbage { .. } => "purge-garbage", + Command::TenantSnapshot { .. } => "tenant-snapshot", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -60,33 +85,75 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + dump_db_connstr, + dump_db_table, + } => { + if let NodeKind::Safekeeper = node_kind { + let dump_db_connstr = + dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; + let dump_db_table = + dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; + + let summary = scan_safekeeper_metadata( + bucket_config.clone(), + tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), + dump_db_connstr, + dump_db_table, + ) + .await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } 
else { - println!("{}", summary.summary_string()); + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } } @@ -102,5 +169,14 @@ async fn main() -> 
anyhow::Result<()> { Command::PurgeGarbage { input_path, mode } => { purge_garbage(input_path, mode, !cli.delete).await } + Command::TenantSnapshot { + tenant_id, + output_path, + concurrency, + } => { + let downloader = + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + downloader.download().await + } } } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 073f37f319..c05874f556 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -5,7 +5,7 @@ use tokio_stream::Stream; use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; use pageserver_api::shard::TenantShardId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( @@ -45,6 +45,62 @@ pub fn stream_tenants<'a>( } } +pub async fn stream_tenant_shards<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant_id: TenantId, +) -> anyhow::Result> + 'a> { + let mut tenant_shard_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let shards_target = target.tenant_shards_prefix(&tenant_id); + + loop { + tracing::info!("Listing in {}", shards_target.prefix_in_bucket); + let fetch_response = + list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; + let fetch_response = match fetch_response { + Err(e) => { + tenant_shard_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&target.tenants_root().prefix_in_bucket)? 
+ .strip_suffix('/') + }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); + + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }); + + for i in new_entry_ids { + tenant_shard_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(stream! { + for i in tenant_shard_ids { + let id = i?; + yield Ok(id); + } + }) +} + /// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. @@ -58,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>( let timelines_target = target.timelines_root(&tenant); loop { - tracing::info!("Listing in {}", tenant); + tracing::debug!("Listing in {}", tenant); let fetch_response = list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) .await; @@ -95,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>( } } - tracing::info!("Yielding for {}", tenant); + tracing::debug!("Yielding for {}", tenant); Ok(stream! 
{ for i in timeline_ids { let id = i?; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_pageserver_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_metadata.rs rename to s3_scrubber/src/scan_pageserver_metadata.rs diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs new file mode 100644 index 0000000000..73dd49ceb5 --- /dev/null +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -0,0 +1,236 @@ +use std::{collections::HashSet, str::FromStr}; + +use aws_sdk_s3::Client; +use futures::stream::{StreamExt, TryStreamExt}; +use pageserver_api::shard::TenantShardId; +use postgres_ffi::{XLogFileName, PG_TLI}; +use serde::Serialize; +use tokio_postgres::types::PgLsn; +use tracing::{error, info, trace}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; + +/// Generally we should ask safekeepers, but so far we use everywhere default 16MB. +const WAL_SEGSIZE: usize = 16 * 1024 * 1024; + +#[derive(Serialize)] +pub struct MetadataSummary { + timeline_count: usize, + with_errors: HashSet, + deleted_count: usize, +} + +impl MetadataSummary { + fn new() -> Self { + Self { + timeline_count: 0, + with_errors: HashSet::new(), + deleted_count: 0, + } + } + + pub fn summary_string(&self) -> String { + format!( + "timeline_count: {}, with_errors: {}", + self.timeline_count, + self.with_errors.len() + ) + } + + pub fn is_empty(&self) -> bool { + self.timeline_count == 0 + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the safekeeper metadata in an S3 bucket, reporting errors and +/// statistics. +/// +/// It works by listing timelines along with timeline_start_lsn and backup_lsn +/// in debug dump in dump_db_table and verifying its s3 contents. 
If some WAL +/// segments are missing, before complaining control plane is queried to check if +/// the project wasn't deleted in the meanwhile. +pub async fn scan_safekeeper_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result { + info!( + "checking bucket {}, region {}, dump_db_table {}", + bucket_config.bucket, bucket_config.region, dump_db_table + ); + // Use the native TLS implementation (Neon requires TLS) + let tls_connector = + postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", + dump_db_table, tenant_filter_clause, + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + info!("loaded {} timelines", timelines.len()); + + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let console_config = ConsoleConfig::from_env()?; + let cloud_admin_api_client = CloudAdminApiClient::new(console_config); + + let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { + let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); + let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); + let timeline_start_lsn_pg: PgLsn = 
row.get(2); + let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); + let backup_lsn_pg: PgLsn = row.get(3); + let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + check_timeline( + &s3_client, + &target, + &cloud_admin_api_client, + ttid, + timeline_start_lsn, + backup_lsn, + ) + }); + // Run multiple check_timeline's concurrently. + const CONCURRENCY: usize = 32; + let mut timelines = checks.try_buffered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + while let Some(r) = timelines.next().await { + let res = r?; + summary.timeline_count += 1; + if !res.is_ok { + summary.with_errors.insert(res.ttid); + } + if res.is_deleted { + summary.deleted_count += 1; + } + } + + Ok(summary) +} + +struct TimelineCheckResult { + ttid: TenantTimelineId, + is_ok: bool, + is_deleted: bool, // timeline is deleted in cplane +} + +/// List s3 and check that is has all expected WAL for the ttid. Consistency +/// errors are logged to stderr; returns Ok(true) if timeline is consistent, +/// Ok(false) if not, Err if failed to check. 
+async fn check_timeline( + s3_client: &Client, + root: &RootTarget, + api_client: &CloudAdminApiClient, + ttid: TenantTimelineId, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +) -> anyhow::Result { + trace!( + "checking ttid {}, should contain WAL [{}-{}]", + ttid, + timeline_start_lsn, + backup_lsn + ); + // calculate expected segfiles + let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); + let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE); + let mut expected_segfiles: HashSet = HashSet::from_iter( + (expected_first_segno..expected_last_segno) + .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), + ); + let expected_files_num = expected_segfiles.len(); + trace!("expecting {} files", expected_segfiles.len(),); + + // now list s3 and check if it misses something + let ttshid = + TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id); + let mut timeline_dir_target = root.timeline_root(&ttshid); + // stream_listing yields only common_prefixes if delimiter is not empty, but + // we need files, so unset it. + timeline_dir_target.delimiter = String::new(); + + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let obj = obj?; + let key = obj.key(); + + let seg_name = key + .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .expect("failed to extract segment name"); + expected_segfiles.remove(seg_name); + } + if !expected_segfiles.is_empty() { + // Before complaining check cplane, probably timeline is already deleted. 
+ let bdata = api_client + .find_timeline_branch(ttid.tenant_id, ttid.timeline_id) + .await?; + let deleted = match bdata { + Some(bdata) => bdata.deleted, + None => { + // note: should be careful with selecting proper cplane address + info!("ttid {} not found, assuming it is deleted", ttid); + true + } + }; + if deleted { + // ok, branch is deleted + return Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: true, + }); + } + error!( + "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}", + ttid, + expected_segfiles.len(), + expected_files_num, + timeline_start_lsn, + backup_lsn, + ); + return Ok(TimelineCheckResult { + ttid, + is_ok: false, + is_deleted: false, + }); + } + Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: false, + }) +} diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs new file mode 100644 index 0000000000..4eccad381b --- /dev/null +++ b/s3_scrubber/src/tenant_snapshot.rs @@ -0,0 +1,293 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; +use anyhow::Context; +use async_stream::stream; +use aws_sdk_s3::Client; +use camino::Utf8PathBuf; +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use utils::generation::Generation; +use utils::id::TenantId; + +pub struct SnapshotDownloader { + s3_client: Arc, + s3_root: RootTarget, + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, +} + +impl SnapshotDownloader { + pub fn new( + bucket_config: 
BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, + ) -> anyhow::Result { + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + Ok(Self { + s3_client, + s3_root, + bucket_config, + tenant_id, + output_path, + concurrency, + }) + } + + async fn download_layer( + &self, + ttid: TenantShardTimelineId, + layer_name: LayerFileName, + layer_metadata: IndexLayerMetadata, + ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> { + // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use + // different layer names (remote-style has the generation suffix) + let local_path = self.output_path.join(format!( + "{}/timelines/{}/{}{}", + ttid.tenant_shard_id, + ttid.timeline_id, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + )); + + // We should only be called for layers that are owned by the input TTID + assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index()); + + // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file + // already exists on local disk, we assume it is fully correct and skip it. + if tokio::fs::try_exists(&local_path).await? { + tracing::debug!("{} already exists", local_path); + return Ok((layer_name, layer_metadata)); + } else { + tracing::debug!("{} requires download...", local_path); + + let timeline_root = self.s3_root.timeline_root(&ttid); + let remote_layer_path = format!( + "{}{}{}", + timeline_root.prefix_in_bucket, + layer_name.file_name(), + layer_metadata.generation.get_suffix() + ); + + // List versions: the object might be deleted. 
+ let versions = self + .s3_client + .list_object_versions() + .bucket(self.bucket_config.bucket.clone()) + .prefix(&remote_layer_path) + .send() + .await?; + let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { + return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); + }; + download_object_to_file( + &self.s3_client, + &self.bucket_config.bucket, + &remote_layer_path, + version.version_id.as_deref(), + &local_path, + ) + .await?; + + tracing::debug!("Downloaded successfully to {local_path}"); + } + + Ok((layer_name, layer_metadata)) + } + + /// Download many layers belonging to the same TTID, with some concurrency + async fn download_layers( + &self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerFileName, IndexLayerMetadata)>, + ) -> anyhow::Result<()> { + let layer_count = layers.len(); + tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); + let layers_stream = stream! { + for (layer_name, layer_metadata) in layers { + yield self.download_layer(ttid, layer_name, layer_metadata); + } + }; + + tokio::fs::create_dir_all(self.output_path.join(format!( + "{}/timelines/{}", + ttid.tenant_shard_id, ttid.timeline_id + ))) + .await?; + + let layer_results = layers_stream.buffered(self.concurrency); + let mut layer_results = std::pin::pin!(layer_results); + + let mut err = None; + let mut download_count = 0; + while let Some(i) = layer_results.next().await { + download_count += 1; + match i { + Ok((layer_name, layer_metadata)) => { + tracing::info!( + "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", + layer_metadata.file_size, + layer_name.file_name() + ); + } + Err(e) => { + // Warn and continue: we will download what we can + tracing::warn!("Download error: {e}"); + err = Some(e); + } + } + } + if let Some(e) = err { + tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}"); + Err(e) + } else { + Ok(()) + } + } + + async fn download_timeline( + &self, + ttid: 
TenantShardTimelineId, + index_part: IndexPart, + index_part_generation: Generation, + ancestor_layers: &mut HashMap< + TenantShardTimelineId, + HashMap, + >, + ) -> anyhow::Result<()> { + let index_bytes = serde_json::to_string(&index_part).unwrap(); + + let layers = index_part + .layer_metadata + .into_iter() + .filter_map(|(layer_name, layer_metadata)| { + if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count { + // Accumulate ancestor layers for later download + let ancestor_ttid = TenantShardTimelineId::new( + TenantShardId { + tenant_id: ttid.tenant_shard_id.tenant_id, + shard_number: layer_metadata.shard.shard_number, + shard_count: layer_metadata.shard.shard_count, + }, + ttid.timeline_id, + ); + let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default(); + use std::collections::hash_map::Entry; + match ancestor_ttid_layers.entry(layer_name) { + Entry::Occupied(entry) => { + // Descendent shards that reference a layer from an ancestor should always have matching metadata, + // as their siblings, because it is read atomically during a shard split. 
+ assert_eq!(entry.get(), &layer_metadata); + } + Entry::Vacant(entry) => { + entry.insert(layer_metadata); + } + } + None + } else { + Some((layer_name, layer_metadata)) + } + }) + .collect(); + + let download_result = self.download_layers(ttid, layers).await; + + // Write index last, once all the layers it references are downloaded + let local_index_path = self.output_path.join(format!( + "{}/timelines/{}/index_part.json{}", + ttid.tenant_shard_id, + ttid.timeline_id, + index_part_generation.get_suffix() + )); + tokio::fs::write(&local_index_path, index_bytes) + .await + .context("writing index")?; + + download_result + } + + pub async fn download(&self) -> anyhow::Result<()> { + let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + + // Generate a stream of TenantShardId + let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards: Vec = shards.try_collect().await?; + + // Only read from shards that have the highest count: avoids redundantly downloading + // from ancestor shards. 
+ let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else { + anyhow::bail!("No shards found"); + }; + + // We will build a collection of layers in anccestor shards to download (this will only + // happen if this tenant has been split at some point) + let mut ancestor_layers: HashMap< + TenantShardTimelineId, + HashMap, + > = Default::default(); + + for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { + // Generate a stream of TenantTimelineId + let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + + // Generate a stream of S3TimelineBlobData + async fn load_timeline_index( + s3_client: &Client, + target: &RootTarget, + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(8)); + + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + match data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _, + } => { + self.download_timeline( + ttid, + index_part, + index_part_generation, + &mut ancestor_layers, + ) + .await + .context("Downloading timeline")?; + } + BlobDataParseResult::Relic => {} + BlobDataParseResult::Incorrect(_) => { + tracing::error!("Bad metadata in timeline {ttid}"); + } + }; + } + } + + for (ttid, layers) in ancestor_layers.into_iter() { + tracing::info!( + "Downloading {} layers from ancvestor timeline {ttid}...", + layers.len() + ); + + self.download_layers(ttid, layers.into_iter().collect()) + .await?; + } + + Ok(()) + } +} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e53ccaeb3d..09c565ce71 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -177,6 +177,10 @@ 
struct Args { /// Controls how long backup will wait until uploading the partial segment. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] partial_backup_timeout: Duration, + /// Disable task to push messages to broker every second. Supposed to + /// be used in tests. + #[arg(long)] + disable_periodic_broker_push: bool, } // Like PathBufValueParser, but allows empty string. @@ -309,6 +313,7 @@ async fn main() -> anyhow::Result<()> { walsenders_keep_horizon: args.walsenders_keep_horizon, partial_backup_enabled: args.partial_backup_enabled, partial_backup_timeout: args.partial_backup_timeout, + disable_periodic_broker_push: args.disable_periodic_broker_push, }; // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 2b1db2714b..98f58d3e49 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,11 +10,20 @@ use anyhow::Result; use storage_broker::parse_proto_ttid; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::FilterTenantTimelineId; +use storage_broker::proto::MessageType; +use storage_broker::proto::SafekeeperDiscoveryResponse; +use storage_broker::proto::SubscribeByFilterRequest; use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TypeSubscription; +use storage_broker::proto::TypedMessage; use storage_broker::Request; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::UNIX_EPOCH; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; @@ -31,6 +40,12 @@ const PUSH_INTERVAL_MSEC: u64 = 1000; /// Push once in a while data about all active timelines to the broker. 
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + if conf.disable_periodic_broker_push { + info!("broker push_loop is disabled, doing nothing..."); + futures::future::pending::<()>().await; // sleep forever + return Ok(()); + } + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -75,7 +90,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { +async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all @@ -94,6 +109,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); while let Some(msg) = stream.message().await? { + stats.update_pulled(); + let proto_ttid = msg .tenant_timeline_id .as_ref() @@ -119,12 +136,93 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } +/// Process incoming discover requests. This is done in a separate task to avoid +/// interfering with the normal pull/push loops. +async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: false, + tenant_timeline_id: None, + }), + }; + + let mut stream = client + .subscribe_by_filter(request) + .await + .context("subscribe_by_filter request failed")? 
+ .into_inner(); + + let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]); + + while let Some(typed_msg) = stream.message().await? { + stats.update_pulled(); + + match typed_msg.r#type() { + MessageType::SafekeeperDiscoveryRequest => { + let msg = typed_msg + .safekeeper_discovery_request + .expect("proto type mismatch from broker message"); + + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // we received a discovery request for a timeline we know about + discover_counter.inc(); + + // create and reply with discovery response + let sk_info = tli.get_safekeeper_info(&conf).await; + let response = SafekeeperDiscoveryResponse { + safekeeper_id: sk_info.safekeeper_id, + tenant_timeline_id: sk_info.tenant_timeline_id, + commit_lsn: sk_info.commit_lsn, + safekeeper_connstr: sk_info.safekeeper_connstr, + availability_zone: sk_info.availability_zone, + }; + + // note this is a blocking call + client + .publish_one(TypedMessage { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: None, + safekeeper_discovery_response: Some(response), + }) + .await?; + } + } + + _ => { + warn!( + "unexpected message type i32 {}, {:?}", + typed_msg.r#type, + typed_msg.r#type() + ); + } + } + } + bail!("end of stream"); +} + pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { info!("started, broker endpoint {:?}", conf.broker_endpoint); let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + let mut discover_handle: Option>> = None; + + let stats = Arc::new(BrokerStats::new()); + let stats_task = task_stats(stats.clone()); + tokio::pin!(stats_task); // Selecting on JoinHandles requires some squats; is there a better 
way to // reap tasks individually? @@ -153,13 +251,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { }; pull_handle = None; }, + res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => { + // was it panic or normal error? + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("discover task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) } + }; + discover_handle = None; + }, _ = ticker.tick() => { if push_handle.is_none() { push_handle = Some(tokio::spawn(push_loop(conf.clone()))); } if pull_handle.is_none() { - pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone()))); } + if discover_handle.is_none() { + discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone()))); + } + }, + _ = &mut stats_task => {} + } + } +} + +struct BrokerStats { + /// Timestamp of the last received message from the broker. + last_pulled_ts: AtomicU64, +} + +impl BrokerStats { + fn new() -> Self { + BrokerStats { + last_pulled_ts: AtomicU64::new(0), + } + } + + fn now_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time is before epoch") + .as_millis() as u64 + } + + /// Update last_pulled timestamp to current time. + fn update_pulled(&self) { + self.last_pulled_ts + .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed); + } +} + +/// Periodically write to logs if there are issues with receiving data from the broker. +async fn task_stats(stats: Arc) { + let warn_duration = Duration::from_secs(10); + let mut ticker = tokio::time::interval(warn_duration); + + loop { + tokio::select! 
{ + _ = ticker.tick() => { + let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst); + if last_pulled == 0 { + // no broker updates yet + continue; + } + + let now = BrokerStats::now_millis(); + if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { + let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + info!("no broker updates for some time, last update: {:?}", ts); + } } } } diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 9b4d4dbb38..543714a54e 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -83,6 +83,7 @@ pub struct SafeKeeperConf { pub walsenders_keep_horizon: bool, pub partial_backup_enabled: bool, pub partial_backup_timeout: Duration, + pub disable_periodic_broker_push: bool, } impl SafeKeeperConf { @@ -129,6 +130,7 @@ impl SafeKeeperConf { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, } } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index f2ee0403eb..e671d4f36a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -725,6 +725,18 @@ where self.state.inmem.commit_lsn ); + // Before first WAL write initialize its segment. It makes first segment + // pg_waldump'able because stream from compute doesn't include its + // segment and page headers. + // + // If we fail before first WAL write flush this action would be + // repeated, that's ok because it is idempotent. 
+ if self.wal_store.flush_lsn() == Lsn::INVALID { + self.wal_store + .initialize_first_segment(msg.start_streaming_at) + .await?; + } + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -1007,6 +1019,10 @@ mod tests { self.lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e3f6a606a0..e496f07114 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata}; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -601,12 +601,18 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { backoff::retry( || async { // Do list-delete in batch_size batches to make progress even if there a lot of files. - // Alternatively we could make list_files return iterator, but it is more complicated and + // Alternatively we could make remote storage list return iterator, but it is more complicated and // I'm not sure deleting while iterating is expected in s3. loop { let files = storage - .list_files(Some(&remote_path), Some(batch_size), &cancel) - .await?; + .list( + Some(&remote_path), + ListingMode::NoDelimiter, + Some(batch_size), + &cancel, + ) + .await? 
+ .keys; if files.is_empty() { return Ok(()); // done } @@ -666,8 +672,9 @@ pub async fn copy_s3_segments( let cancel = CancellationToken::new(); let files = storage - .list_files(Some(&remote_path), None, &cancel) - .await?; + .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .await? + .keys; let uploaded_segments = &files .iter() diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 147f318b9f..6bc8c7c3f9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -38,6 +38,12 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; + /// Initialize segment by creating proper long header at the beginning of + /// the segment and short header at the page of given LSN. This is only used + /// for timeline initialization because compute will stream data only since + /// init_lsn. Other segment headers are included in compute stream. + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -78,6 +84,8 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, + pg_version: u32, + system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. 
@@ -169,6 +177,8 @@ impl PhysicalStorage { timeline_dir, conf: conf.clone(), wal_seg_size, + pg_version: state.server.pg_version, + system_id: state.server.system_id, write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, @@ -324,6 +334,20 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let segno = init_lsn.segment_number(self.wal_seg_size); + let (mut file, _) = self.open_or_create(segno).await?; + let major_pg_version = self.pg_version / 10000; + let wal_seg = + postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&wal_seg).await?; + file.flush().await?; + info!("initialized segno {} at lsn {}", segno, init_lsn); + // note: file is *not* fsynced + Ok(()) + } + /// Write WAL to disk. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. 
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index bc21c4d765..27e2a4453b 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -178,6 +178,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { walsenders_keep_horizon: false, partial_backup_enabled: false, partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 35bca325aa..c2db9de78a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -182,6 +182,10 @@ impl wal_storage::Storage for DiskWALStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { if self.write_lsn != startpos { diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py deleted file mode 100755 index 84b69cb36a..0000000000 --- a/scripts/export_import_between_pageservers.py +++ /dev/null @@ -1,730 +0,0 @@ -# -# Script to export tenants from one pageserver and import them into another page server. -# -# Outline of steps: -# 1. Get `(last_lsn, prev_lsn)` from old pageserver -# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file -# 3. This tar file might be missing relation files for empty relations, if the pageserver -# is old enough (we didn't always store those). So to recreate them, we start a local -# vanilla postgres on this basebackup and ask it what relations should exist, then touch -# any missing files and re-pack the tar. 
-# TODO This functionality is no longer needed, so we can delete it later if we don't -# end up using the same utils for the pg 15 upgrade. Not sure. -# 4. We import the patched basebackup into a new pageserver -# 5. We export again via fullbackup, now from the new pageserver and compare the returned -# tar file with the one we imported. This confirms that we imported everything that was -# exported, but doesn't guarantee correctness (what if we didn't **export** everything -# initially?) -# 6. We wait for the new pageserver's remote_consistent_lsn to catch up -# -# For more context on how to use this, see: -# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf - -import argparse -import os -import shutil -import subprocess -import tempfile -import time -import uuid -from contextlib import closing -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -import psycopg2 -import requests -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import parse_dsn - -############################################### -### client-side utils copied from test fixtures -############################################### - -Env = Dict[str, str] - -_global_counter = 0 - - -def global_counter() -> int: - """A really dumb global counter. - This is useful for giving output files a unique number, so if we run the - same command multiple times we can keep their output separate. - """ - global _global_counter - _global_counter += 1 - return _global_counter - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" - where "cmd" is the name of the program and NNN is an incrementing - counter. - If those files already exist, we will overwrite them. - Returns basepath for files with captured output. 
- """ - assert isinstance(cmd, list) - base = f"{os.path.basename(cmd[0])}_{global_counter()}" - basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f'(capturing output to "{base}.stdout")') - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) - - return basepath - - -class PgBin: - """A helper class for executing postgres binaries""" - - def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin") - self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib") - - def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): - """ - Run one of the postgres binaries. - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - All the necessary environment variables will be set. - If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - If you want stdout/stderr captured to files, use `run_capture` instead. - """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture( - self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. 
- This is just like `run`, but for chatty programs. Returns basepath for files - with captured output. - """ - - self._fixpath(command) - print(f'Running command "{" ".join(command)}"') - env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) - - -class PgProtocol: - """Reusable connection logic""" - - def __init__(self, **kwargs): - self.default_options = kwargs - - def conn_options(self, **kwargs): - conn_options = self.default_options.copy() - if "dsn" in kwargs: - conn_options.update(parse_dsn(kwargs["dsn"])) - conn_options.update(kwargs) - - # Individual statement timeout in seconds. 2 minutes should be - # enough for our tests, but if you need a longer, you can - # change it by calling "SET statement_timeout" after - # connecting. - conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" - - return conn_options - - # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) - - # WARNING: this setting affects *all* tests! - conn.autocommit = autocommit - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. - """ - return self.safe_psql_many([query], **kwargs)[0] - - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: - """ - Execute queries against the node and return all rows. - This method passes all extra params to connstr. 
- """ - result: List[List[Any]] = [] - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - for query in queries: - print(f"Executing query: {query}") - cur.execute(query) - - if cur.description is None: - result.append([]) # query didn't return data - else: - result.append(cast(List[Any], cur.fetchall())) - return result - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host="localhost", port=port, dbname="postgres") - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) - self.configure([f"port = {port}\n"]) - - def configure(self, options: List[str]): - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: - conf_file.write("\n".join(options)) - - def start(self, log_path: Optional[str] = None): - assert not self.running - self.running = True - - log_path = log_path or os.path.join(self.pgdatadir, "pg.log") - - self.pg_bin.run_capture( - ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] - ) - - def stop(self): - assert self.running - self.running = False - self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -class NeonPageserverApiException(Exception): - pass - - -class NeonPageserverHttpClient(requests.Session): - def __init__(self, host, port): - super().__init__() - self.host = host - self.port = port - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise NeonPageserverApiException(msg) from e - - def check_status(self): - 
self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() - - def tenant_list(self): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): - res = self.post( - f"http://{self.host}:{self.port}/v1/tenant", - json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, - ) - - if res.status_code == 409: - if ok_if_exists: - print(f"could not create tenant: already exists for id {new_tenant_id}") - else: - res.raise_for_status() - elif res.status_code == 201: - print(f"created tenant {new_tenant_id}") - else: - self.verbose_error(res) - - return new_tenant_id - - def timeline_list(self, tenant_id: uuid.UUID): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}" - - -def lsn_from_hex(lsn_hex: str) -> int: - """Convert lsn from hex notation to int.""" - left, right = lsn_hex.split("/") - return (int(left, 16) << 32) + int(right, 16) - - -def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) - - -def 
wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, -): - """waits for local timeline upload up to specified lsn""" - for i in range(10): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return - print( - f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}" - ) - time.sleep(1) - - raise Exception( - f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}" - ) - - -############## -# End of utils -############## - - -def pack_base(log_dir, restored_dir, output_tar): - """Create tar file from basebackup, being careful to produce relative filenames.""" - tmp_tar_name = "tmp.tar" - tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) - cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) - # We actually cd into the dir and call tar from there. If we call tar from - # outside we won't encode filenames as relative, and they won't parse well - # on import. - subprocess_capture(log_dir, cmd, cwd=restored_dir) - shutil.move(tmp_tar_path, output_tar) - - -def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): - """Reconstruct what relation files should exist in the datadir by querying postgres.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) - - # Start a vanilla postgres from the given datadir and query it to find - # what relfiles should exist, but possibly don't. 
- with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) - vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) - - # Create database based on template0 because we can't connect to template0 - query = "create database template0copy template template0" - vanilla_pg.safe_psql(query, user="cloud_admin") - vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") - - # Get all databases - query = "select oid, datname from pg_database" - oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") - template0_oid = [ - oid for (oid, database) in oid_dbname_pairs if database == "template0" - ][0] - - # Get rel paths for each database - for oid, database in oid_dbname_pairs: - if database == "template0": - # We can't connect to template0 - continue - - query = "select relname, pg_relation_filepath(oid) from pg_class" - result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for _relname, filepath in result: - if filepath is not None: - if database == "template0copy": - # Add all template0copy paths to template0 - prefix = f"base/{oid}/" - if filepath.startswith(prefix): - suffix = filepath[len(prefix) :] - yield f"base/{template0_oid}/{suffix}" - elif filepath.startswith("global"): - print(f"skipping {database} global file {filepath}") - else: - raise AssertionError - else: - yield filepath - - -def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): - """Add the appropriate empty files to a basebadkup tar.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) - - # Touch files that don't exist - for path in paths: - absolute_path = os.path.join(restored_dir, path) - exists = os.path.exists(absolute_path) - if not exists: - print(f"File {absolute_path} didn't exist. 
Creating..") - Path(absolute_path).touch() - - # Repackage - pack_base(log_dir, restored_dir, output_tar) - - -# HACK This is a workaround for exporting from old pageservers that -# can't export empty relations. In this case we need to start -# a vanilla postgres from the exported datadir, and query it -# to see what empty relations are missing, and then create -# those empty files before importing. -def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) - touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) - - -def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - with closing(psycopg2.connect(pageserver_connstr)) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - assert res is not None - prev_lsn = res[0] - last_lsn = res[1] - - return last_lsn, prev_lsn - - -def import_timeline( - args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" - full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - - stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") - - print(f"Running: {full_cmd}") - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename2, "w") as stderr_f: - print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True, - ) - - 
print("Done import") - - # Wait until pageserver persists the files - wait_for_upload( - pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) - ) - - -def export_timeline( - args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Choose filenames - incomplete_filename = tar_filename + ".incomplete" - stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") - - # Construct export command - query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" - cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] - - # Run export command - print(f"Running: {cmd}") - with open(incomplete_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True - ) - - # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) - - # Log more info - file_size = os.path.getsize(tar_filename) - print(f"Done export: {tar_filename}, size {file_size}") - - -def main(args: argparse.Namespace): - # any psql version will do here. 
use current DEFAULT_PG_VERSION = 15 - psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") - - old_pageserver_host = args.old_pageserver_host - new_pageserver_host = args.new_pageserver_host - - old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) - old_http_client.check_status() - old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" - - new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) - new_http_client.check_status() - new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" - - for tenant_id in args.tenants: - print(f"Tenant: {tenant_id}") - timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) - print(f"Timelines: {timelines}") - - # Create tenant in new pageserver - if args.only_import is False and not args.timelines: - new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) - - for timeline in timelines: - # Skip timelines we don't need to export - if args.timelines and timeline["timeline_id"] not in args.timelines: - print(f"Skipping timeline {timeline['timeline_id']}") - continue - - # Choose filenames - tar_filename = os.path.join( - args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" - ) - - pg_version = timeline["pg_version"] - - # Export timeline from old pageserver - if args.only_import is False: - last_lsn, prev_lsn = get_rlsn( - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - ) - export_timeline( - args, - psql_path, - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Import into new pageserver - import_timeline( - args, - psql_path, - new_pageserver_connstr, - new_http_client, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Re-export and compare 
- re_export_filename = tar_filename + ".reexport" - export_timeline( - args, - psql_path, - new_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - re_export_filename, - pg_version, - ) - - # Check the size is the same - old_size = (os.path.getsize(tar_filename),) - new_size = (os.path.getsize(re_export_filename),) - if old_size != new_size: - raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") - - -def non_zero_tcp_port(arg: Any): - port = int(arg) - if port < 1 or port > 65535: - raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") - return port - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant-id", - dest="tenants", - required=True, - nargs="+", - help="Id of the tenant to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--timeline-id", - dest="timelines", - required=False, - nargs="+", - help="Id of the timeline to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--from-host", - dest="old_pageserver_host", - required=True, - help="Host of the pageserver to migrate data from", - ) - parser.add_argument( - "--from-http-port", - dest="old_pageserver_http_port", - required=False, - type=int, - default=9898, - help="HTTP port of the pageserver to migrate data from. Default: 9898", - ) - parser.add_argument( - "--from-pg-port", - dest="old_pageserver_pg_port", - required=False, - type=int, - default=6400, - help="pg port of the pageserver to migrate data from. Default: 6400", - ) - parser.add_argument( - "--to-host", - dest="new_pageserver_host", - required=True, - help="Host of the pageserver to migrate data to", - ) - parser.add_argument( - "--to-http-port", - dest="new_pageserver_http_port", - required=False, - default=9898, - type=int, - help="HTTP port of the pageserver to migrate data to. 
Default: 9898", - ) - parser.add_argument( - "--to-pg-port", - dest="new_pageserver_pg_port", - required=False, - default=6400, - type=int, - help="pg port of the pageserver to migrate data to. Default: 6400", - ) - parser.add_argument( - "--ignore-tenant-exists", - dest="ok_if_exists", - required=False, - help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", - ) - parser.add_argument( - "--pg-distrib-dir", - dest="pg_distrib_dir", - required=False, - default="/usr/local/", - help="Path where postgres binaries are installed. Default: /usr/local/", - ) - parser.add_argument( - "--psql-path", - dest="psql_path", - required=False, - default="/usr/local/v14/bin/psql", - help="Path to the psql binary. Default: /usr/local/v14/bin/psql", - ) - parser.add_argument( - "--only-import", - dest="only_import", - required=False, - default=False, - action="store_true", - help="Skip export and tenant creation part", - ) - parser.add_argument( - "--work-dir", - dest="work_dir", - required=True, - default=False, - help="directory where temporary tar files are stored", - ) - parser.add_argument( - "--tmp-pg-port", - dest="tmp_pg_port", - required=False, - default=55439, - type=non_zero_tcp_port, - help="localhost port to use for temporary postgres instance", - ) - args = parser.parse_args() - main(args) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index 853c67d218..878840fcee 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -15,8 +15,7 @@ FLAKY_TESTS_QUERY = """ DISTINCT parent_suite, suite, name FROM results WHERE - started_at > CURRENT_DATE - INTERVAL '10' day - AND started_at > '2024-03-11 14:50:11.845+00' -- we switched the default PAGESERVER_VIRTUAL_FILE_IO_ENGINE to `tokio-epoll-uring` from `std-fs` on this date, we want to ignore the flaky tests for `std-fs` + started_at > CURRENT_DATE - INTERVAL '%s' day AND ( (status IN ('failed', 'broken') AND 
reference = 'refs/heads/main') OR flaky diff --git a/scripts/sk_cleanup_tenants/script.py b/scripts/sk_cleanup_tenants/script.py index fa22433614..c20a4bb830 100644 --- a/scripts/sk_cleanup_tenants/script.py +++ b/scripts/sk_cleanup_tenants/script.py @@ -22,7 +22,7 @@ parser.add_argument("--safekeeper-host", required=True, type=str) args = parser.parse_args() access_key = os.getenv("CONSOLE_API_TOKEN") -endpoint: str = "https://console.stage.neon.tech/api" +endpoint: str = "https://console-stage.neon.build/api" trash_dir: Path = args.trash_dir dry_run: bool = args.dry_run diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 7494a6cb78..5ae55e058b 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -3,7 +3,7 @@ 3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` # staging: -AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 4e5f8ed724..8c88b61abc 100644 --- 
a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -196,8 +196,13 @@ impl SubscriptionKey { /// Parse from FilterTenantTimelineId pub fn from_proto_filter_tenant_timeline_id( - f: &FilterTenantTimelineId, + opt: Option<&FilterTenantTimelineId>, ) -> Result { + if opt.is_none() { + return Ok(SubscriptionKey::All); + } + + let f = opt.unwrap(); if !f.enabled { return Ok(SubscriptionKey::All); } @@ -534,10 +539,7 @@ impl BrokerService for Broker { .remote_addr() .expect("TCPConnectInfo inserted by handler"); let proto_filter = request.into_inner(); - let ttid_filter = proto_filter - .tenant_timeline_id - .as_ref() - .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; let types_set = proto_filter diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 165cafaf4e..194619a496 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -31,7 +31,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true serde.workspace = true serde_json.workspace = true @@ -40,6 +40,8 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +strum.workspace = true +strum_macros.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index eb0c4472e4..9d326ef82d 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -3,11 +3,13 @@ use std::{collections::HashMap, time::Duration}; use 
control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; -use hyper::{Method, StatusCode}; +use futures::StreamExt; +use hyper::StatusCode; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; use utils::{ backoff::{self}, id::{NodeId, TenantId}, @@ -17,6 +19,8 @@ use crate::service::Config; const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) const API_CONCURRENCY: usize = 32; struct UnshardedComputeHookTenant { @@ -242,6 +246,10 @@ pub(super) struct ComputeHook { // This lock is only used in testing enviroments, to serialize calls into neon_lock neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, } impl ComputeHook { @@ -251,12 +259,18 @@ impl ComputeHook { .clone() .map(|jwt| format!("Bearer {}", jwt)); + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + Self { state: Default::default(), config, authorization_header, neon_local_lock: Default::default(), api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, } } @@ -310,12 +324,11 @@ impl ComputeHook { async fn do_notify_iteration( &self, - client: &reqwest::Client, url: &String, reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let req = client.request(Method::PUT, url); + let req = self.client.request(reqwest::Method::PUT, url); let req = if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { @@ -334,8 +347,10 @@ impl ComputeHook { }; 
// Treat all 2xx responses as success - if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES { - if response.status() != StatusCode::OK { + if response.status() >= reqwest::StatusCode::OK + && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES + { + if response.status() != reqwest::StatusCode::OK { // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so // log a warning. tracing::warn!( @@ -349,7 +364,7 @@ impl ComputeHook { // Error response codes match response.status() { - StatusCode::TOO_MANY_REQUESTS => { + reqwest::StatusCode::TOO_MANY_REQUESTS => { // TODO: 429 handling should be global: set some state visible to other requests // so that they will delay before starting, rather than all notifications trying // once before backing off. @@ -358,20 +373,30 @@ impl ComputeHook { .ok(); Err(NotifyError::SlowDown) } - StatusCode::LOCKED => { + reqwest::StatusCode::LOCKED => { // We consider this fatal, because it's possible that the operation blocking the control one is // also the one that is waiting for this reconcile. We should let the reconciler calling // this hook fail, to give control plane a chance to un-lock. 
tracing::info!("Control plane reports tenant is locked, dropping out of notify"); Err(NotifyError::Busy) } - StatusCode::SERVICE_UNAVAILABLE - | StatusCode::GATEWAY_TIMEOUT - | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())), - StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { - Err(NotifyError::Fatal(response.status())) + reqwest::StatusCode::SERVICE_UNAVAILABLE => { + Err(NotifyError::Unavailable(StatusCode::SERVICE_UNAVAILABLE)) } - _ => Err(NotifyError::Unexpected(response.status())), + reqwest::StatusCode::GATEWAY_TIMEOUT => { + Err(NotifyError::Unavailable(StatusCode::GATEWAY_TIMEOUT)) + } + reqwest::StatusCode::BAD_GATEWAY => { + Err(NotifyError::Unavailable(StatusCode::BAD_GATEWAY)) + } + + reqwest::StatusCode::BAD_REQUEST => Err(NotifyError::Fatal(StatusCode::BAD_REQUEST)), + reqwest::StatusCode::UNAUTHORIZED => Err(NotifyError::Fatal(StatusCode::UNAUTHORIZED)), + reqwest::StatusCode::FORBIDDEN => Err(NotifyError::Fatal(StatusCode::FORBIDDEN)), + status => Err(NotifyError::Unexpected( + hyper::StatusCode::from_u16(status.as_u16()) + .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR), + )), } } @@ -381,8 +406,6 @@ impl ComputeHook { reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let client = reqwest::Client::new(); - // We hold these semaphore units across all retries, rather than only across each // HTTP request: this is to preserve fairness and avoid a situation where a retry might // time out waiting for a semaphore. @@ -394,7 +417,7 @@ impl ComputeHook { .map_err(|_| NotifyError::ShuttingDown)?; backoff::retry( - || self.do_notify_iteration(&client, url, reconfigure_request, cancel), + || self.do_notify_iteration(url, reconfigure_request, cancel), |e| { matches!( e, @@ -411,48 +434,37 @@ impl ComputeHook { .and_then(|x| x) } - /// Call this to notify the compute (postgres) tier of new pageservers to use - /// for a tenant. 
notify() is called by each shard individually, and this function - /// will decide whether an update to the tenant is sent. An update is sent on the - /// condition that: - /// - We know a pageserver for every shard. - /// - All the shards have the same shard_count (i.e. we are not mid-split) - /// - /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler - /// that is cancelled. - /// - /// This function is fallible, including in the case that the control plane is transiently - /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability - /// periods, but we don't retry forever. The **caller** is responsible for handling failures and - /// ensuring that they eventually call again to ensure that the compute is eventually notified of - /// the proper pageserver nodes for a tenant. - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + /// Synchronous phase: update the per-tenant state for the next intended notification + fn notify_prepare( &self, tenant_shard_id: TenantShardId, node_id: NodeId, stripe_size: ShardStripeSize, + ) -> MaybeSendResult { + let mut state_locked = self.state.lock().unwrap(); + + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + } + + async fn notify_execute( + &self, + maybe_send_result: MaybeSendResult, + tenant_shard_id: TenantShardId, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = { - let mut state_locked = self.state.lock().unwrap(); - - use std::collections::hash_map::Entry; - 
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant - } - }; - tenant.maybe_send(tenant_shard_id.tenant_id, None) - }; - // Process result: we may get an update to send, or we may have to wait for a lock // before trying again. let (request, mut send_lock_guard) = match maybe_send_result { @@ -460,7 +472,12 @@ impl ComputeHook { return Ok(()); } MaybeSendResult::AwaitLock(send_lock) => { - let send_locked = send_lock.lock_owned().await; + let send_locked = tokio::select! { + guard = send_lock.lock_owned() => {guard}, + _ = cancel.cancelled() => { + return Err(NotifyError::ShuttingDown) + } + }; // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses @@ -499,6 +516,94 @@ impl ComputeHook { } result } + + /// Infallible synchronous fire-and-forget version of notify(), that sends its results to + /// a channel. Something should consume the channel and arrange to try notifying again + /// if something failed. 
+ pub(super) fn notify_background( + self: &Arc, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + result_tx: tokio::sync::mpsc::Sender>, + cancel: &CancellationToken, + ) { + let mut maybe_sends = Vec::new(); + for (tenant_shard_id, node_id, stripe_size) in notifications { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + maybe_sends.push((tenant_shard_id, maybe_send_result)) + } + + let this = self.clone(); + let cancel = cancel.clone(); + + tokio::task::spawn(async move { + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The + // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which + // would mostly just be waiting on that semaphore. + let mut stream = futures::stream::iter(maybe_sends) + .map(|(tenant_shard_id, maybe_send_result)| { + let this = this.clone(); + let cancel = cancel.clone(); + + async move { + this + .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .await.map_err(|e| (tenant_shard_id, e)) + }.instrument(info_span!( + "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + )) + }) + .buffered(API_CONCURRENCY); + + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(r) => { + result_tx.send(r).await.ok(); + }, + None => { + tracing::info!("Finished sending background compute notifications"); + break; + } + } + }, + _ = cancel.cancelled() => { + tracing::info!("Shutdown while running background compute notifications"); + break; + } + }; + } + }); + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. 
An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + .await + } } #[cfg(test)] diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index 7669680eb6..1ef97e78eb 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -184,6 +184,19 @@ impl HeartbeaterTask { } } } + tracing::info!( + "Heartbeat round complete for {} nodes, {} offline", + new_state.len(), + new_state + .values() + .filter(|s| match s { + PageserverState::Available { .. 
} => { + false + } + PageserverState::Offline => true, + }) + .count() + ); let mut deltas = Vec::new(); let now = Instant::now(); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c59bcaa174..604ad6fbaa 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -4,10 +4,12 @@ use crate::metrics::{ }; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, @@ -44,15 +46,19 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use routerify::Middleware; /// State available to HTTP request handlers -#[derive(Clone)] pub struct HttpState { service: Arc, auth: Option>, + neon_metrics: NeonMetrics, allowlist_routes: Vec, } impl HttpState { - pub fn new(service: Arc, auth: Option>) -> Self { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { let allowlist_routes = ["/status", "/ready", "/metrics"] .iter() .map(|v| v.parse().unwrap()) @@ -60,6 +66,7 @@ impl HttpState { Self { service, auth, + neon_metrics: NeonMetrics::new(build_info), allowlist_routes, } } @@ -252,6 +259,12 @@ async fn handle_tenant_time_travel_remote_storage( json_response(StatusCode::OK, ()) } +fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result { + hyper::StatusCode::from_u16(status.as_u16()) + .context("invalid status code") + .map_err(ApiError::InternalServerError) +} + async fn handle_tenant_secondary_download( service: Arc, req: Request, @@ -260,7 +273,7 @@ async fn handle_tenant_secondary_download( let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); 
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; - json_response(status, progress) + json_response(map_reqwest_hyper_status(status)?, progress) } async fn handle_tenant_delete( @@ -271,7 +284,10 @@ async fn handle_tenant_delete( check_permissions(&req, Scope::PageServerApi)?; deletion_wrapper(service, move |service| async move { - service.tenant_delete(tenant_id).await + service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -302,7 +318,10 @@ async fn handle_tenant_timeline_delete( let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; deletion_wrapper(service, move |service| async move { - service.tenant_timeline_delete(tenant_id, timeline_id).await + service + .tenant_timeline_delete(tenant_id, timeline_id) + .await + .and_then(map_reqwest_hyper_status) }) .await } @@ -365,11 +384,9 @@ async fn handle_tenant_timeline_passthrough( } // We have a reqest::Response, would like a http::Response - let mut builder = hyper::Response::builder() - .status(resp.status()) - .version(resp.version()); + let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); for (k, v) in resp.headers() { - builder = builder.header(k, v); + builder = builder.header(k.as_str(), v.as_bytes()); } let response = builder @@ -516,6 +533,18 @@ async fn handle_tenant_drop(req: Request) -> Result, ApiErr json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) 
} +async fn handle_tenant_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.tenant_import(tenant_id).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -672,10 +701,11 @@ fn epilogue_metrics_middleware }) } -pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; - let payload = crate::metrics::METRICS_REGISTRY.encode(); + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() .status(200) .header(CONTENT_TYPE, TEXT_FORMAT) @@ -704,6 +734,7 @@ where pub fn make_router( service: Arc, auth: Option>, + build_info: BuildInfo, ) -> RouterBuilder { let mut router = endpoint::make_router() .middleware(prologue_metrics_middleware()) @@ -720,7 +751,7 @@ pub fn make_router( } router - .data(Arc::new(HttpState::new(service, auth))) + .data(Arc::new(HttpState::new(service, auth, build_info))) .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) @@ -751,6 +782,13 @@ pub fn make_router( .post("/debug/v1/node/:node_id/drop", |r| { named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) }) + .post("/debug/v1/tenant/:tenant_id/import", |r| { + named_request_span( + r, + handle_tenant_import, + RequestName("debug_v1_tenant_import"), + ) + }) .get("/debug/v1/tenant", |r| { named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) }) @@ -874,7 +912,7 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) - // Tenant detail GET passthrough to shard zero + // Tenant detail GET passthrough to 
shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( r, @@ -882,13 +920,14 @@ pub fn make_router( RequestName("v1_tenant_passthrough"), ) }) - // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future - // timeline GET APIs will be implicitly included. - .get("/v1/tenant/:tenant_id/timeline*", |r| { + // The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver + // are implicitly exposed here. This must be last in the list to avoid + // taking precedence over other GET methods we might implement by hand. + .get("/v1/tenant/:tenant_id/*", |r| { tenant_service_handler( r, handle_tenant_timeline_passthrough, - RequestName("v1_tenant_timeline_passthrough"), + RequestName("v1_tenant_passthrough"), ) }) } diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index b03700b50c..dff793289f 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,25 +1,64 @@ +use std::fmt::Display; +use std::time::Instant; use std::{collections::HashMap, sync::Arc}; +use std::time::Duration; + +use crate::service::RECONCILE_TIMEOUT; + +const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; + +/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the +/// current holding operation in lock. +pub struct WrappedWriteGuard { + guard: tokio::sync::OwnedRwLockWriteGuard>, + start: Instant, +} + +impl WrappedWriteGuard { + pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { + Self { + guard, + start: Instant::now(), + } + } +} + +impl Drop for WrappedWriteGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Lock on {} was held for {:?}", + self.guard.as_ref().unwrap(), + duration + ); + } + *self.guard = None; + } +} + /// A map of locks covering some arbitrary identifiers. 
Useful if you have a collection of objects but don't /// want to embed a lock in each one, or if your locking granularity is different to your object granularity. /// For example, used in the storage controller where the objects are tenant shards, but sometimes locking /// is needed at a tenant-wide granularity. -pub(crate) struct IdLockMap +pub(crate) struct IdLockMap where T: Eq + PartialEq + std::hash::Hash, { /// A synchronous lock for getting/setting the async locks that our callers will wait on. - entities: std::sync::Mutex>>>, + entities: std::sync::Mutex>>>>, } -impl IdLockMap +impl IdLockMap where T: Eq + PartialEq + std::hash::Hash, + I: Display, { pub(crate) fn shared( &self, key: T, - ) -> impl std::future::Future> { + ) -> impl std::future::Future>> { let mut locked = self.entities.lock().unwrap(); let entry = locked.entry(key).or_default(); entry.clone().read_owned() @@ -28,21 +67,26 @@ where pub(crate) fn exclusive( &self, key: T, - ) -> impl std::future::Future> { + operation: I, + ) -> impl std::future::Future> { let mut locked = self.entities.lock().unwrap(); - let entry = locked.entry(key).or_default(); - entry.clone().write_owned() + let entry = locked.entry(key).or_default().clone(); + async move { + let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + *guard.guard = Some(operation); + guard + } } /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do /// periodic housekeeping to avoid the map growing indefinitely pub(crate) fn housekeeping(&self) { let mut locked = self.entities.lock().unwrap(); - locked.retain(|_k, lock| lock.try_write().is_err()) + locked.retain(|_k, entry| entry.try_write().is_err()) } } -impl Default for IdLockMap +impl Default for IdLockMap where T: Eq + PartialEq + std::hash::Hash, { @@ -52,3 +96,94 @@ where } } } + +pub async fn trace_exclusive_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display + Clone, +>( + op_locks: &IdLockMap, + 
key: T, + operation: I, +) -> WrappedWriteGuard { + let start = Instant::now(); + let guard = op_locks.exclusive(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for exclusive lock", + operation, + key, + duration + ); + } + + guard +} + +pub async fn trace_shared_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> tokio::sync::OwnedRwLockReadGuard> { + let start = Instant::now(); + let guard = op_locks.shared(key.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for shared lock", + operation, + key, + duration + ); + } + + guard +} + +#[cfg(test)] +mod tests { + use super::IdLockMap; + + #[derive(Clone, Debug, strum_macros::Display, PartialEq)] + enum Operations { + Op1, + Op2, + } + + #[tokio::test] + async fn multiple_shared_locks() { + let id_lock_map: IdLockMap = IdLockMap::default(); + + let shared_lock_1 = id_lock_map.shared(1).await; + let shared_lock_2 = id_lock_map.shared(1).await; + + assert!(shared_lock_1.is_none()); + assert!(shared_lock_2.is_none()); + } + + #[tokio::test] + async fn exclusive_locks() { + let id_lock_map = IdLockMap::default(); + let resource_id = 1; + + { + let _ex_lock = id_lock_map.exclusive(resource_id, Operations::Op1).await; + assert_eq!(_ex_lock.guard.clone().unwrap(), Operations::Op1); + + let _ex_lock_2 = tokio::time::timeout( + tokio::time::Duration::from_millis(1), + id_lock_map.exclusive(resource_id, Operations::Op2), + ) + .await; + assert!(_ex_lock_2.is_err()); + } + + let shared_lock_1 = id_lock_map.shared(resource_id).await; + assert!(shared_lock_1.is_none()); + } +} diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 3c03d6efe8..f1454af533 100644 --- 
a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -3,11 +3,14 @@ use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; use std::sync::Arc; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; -use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; +use storage_controller::service::{ + Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, +}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; @@ -62,6 +65,14 @@ struct Cli { /// Grace period before marking unresponsive pageserver offline #[arg(long)] max_unavailable_interval: Option, + + /// Maximum number of reconcilers that may run in parallel + #[arg(long)] + reconciler_concurrency: Option, + + /// How long to wait for the initial database connection to be available. 
+ #[arg(long, default_value = "5s")] + db_connect_timeout: humantime::Duration, } enum StrictMode { @@ -192,6 +203,11 @@ async fn async_main() -> anyhow::Result<()> { args.listen ); + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + let strict_mode = if args.dev { StrictMode::Dev } else { @@ -236,9 +252,14 @@ async fn async_main() -> anyhow::Result<()> { .max_unavailable_interval .map(humantime::Duration::into) .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + reconciler_concurrency: args + .reconciler_concurrency + .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), }; // After loading secrets & config, but before starting anything else, apply database migrations + Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; + migration_run(&secrets.database_url) .await .context("Running database migrations")?; @@ -253,7 +274,7 @@ async fn async_main() -> anyhow::Result<()> { let auth = secrets .public_key .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); - let router = make_router(service.clone(), auth) + let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; let router_service = utils::http::RouterService::new(router).unwrap(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index cabf416b9f..ac9f22c739 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -8,10 +8,8 @@ //! The rest of the code defines label group types and deals with converting outer types to labels. //! 
use bytes::Bytes; -use measured::{ - label::{LabelValue, StaticLabelSet}, - FixedCardinalityLabel, MetricGroup, -}; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; @@ -26,13 +24,15 @@ pub fn preinitialize_metrics() { pub(crate) struct StorageControllerMetrics { pub(crate) metrics_group: StorageControllerMetricGroup, - encoder: Mutex, + encoder: Mutex, } #[derive(measured::MetricGroup)] +#[metric(new())] pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, @@ -43,7 +43,9 @@ pub(crate) struct StorageControllerMetricGroup { /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_http_request_latency: measured::HistogramVec, @@ -55,6 +57,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, @@ -66,6 +69,7 @@ pub(crate) struct StorageControllerMetricGroup { /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. 
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_passthrough_request_latency: measured::HistogramVec, @@ -74,76 +78,34 @@ pub(crate) struct StorageControllerMetricGroup { measured::CounterVec, /// Latency of database queries, broken down by operation. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] pub(crate) storage_controller_database_query_latency: measured::HistogramVec, } impl StorageControllerMetrics { - pub(crate) fn encode(&self) -> Bytes { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { let mut encoder = self.encoder.lock().unwrap(); - self.metrics_group.collect_into(&mut *encoder); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); encoder.finish() } } impl Default for StorageControllerMetrics { fn default() -> Self { - Self { - metrics_group: StorageControllerMetricGroup::new(), - encoder: Mutex::new(measured::text::TextEncoder::new()), - } - } -} + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); -impl StorageControllerMetricGroup { - pub(crate) fn new() -> Self { Self { - storage_controller_reconcile_spawn: measured::Counter::new(), - storage_controller_reconcile_complete: measured::CounterVec::new( - ReconcileCompleteLabelGroupSet { - status: StaticLabelSet::new(), - }, - ), - storage_controller_schedule_optimization: measured::Counter::new(), - storage_controller_http_request_status: measured::CounterVec::new( - HttpRequestStatusLabelGroupSet { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - status: StaticLabelSet::new(), - }, - ), - storage_controller_http_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 
2.0), - ), - storage_controller_pageserver_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_pageserver_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_passthrough_request_error: measured::CounterVec::new( - PageserverRequestLabelGroupSet { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - }, - ), - storage_controller_passthrough_request_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), - storage_controller_database_query_error: measured::CounterVec::new( - DatabaseQueryErrorLabelGroupSet { - operation: StaticLabelSet::new(), - error_type: StaticLabelSet::new(), - }, - ), - storage_controller_database_query_latency: measured::HistogramVec::new( - measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), - ), + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), } } } @@ -157,7 +119,7 @@ pub(crate) struct ReconcileCompleteLabelGroup { #[derive(measured::LabelGroup)] #[label(set = HttpRequestStatusLabelGroupSet)] pub(crate) struct HttpRequestStatusLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, pub(crate) status: StatusCode, @@ -166,40 +128,21 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> { #[derive(measured::LabelGroup)] #[label(set = HttpRequestLatencyLabelGroupSet)] pub(crate) struct HttpRequestLatencyLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default 
for HttpRequestLatencyLabelGroupSet { - fn default() -> Self { - Self { - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup, Clone)] #[label(set = PageserverRequestLabelGroupSet)] pub(crate) struct PageserverRequestLabelGroup<'a> { - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) pageserver_id: &'a str, - #[label(dynamic_with = lasso::ThreadedRodeo)] + #[label(dynamic_with = lasso::ThreadedRodeo, default)] pub(crate) path: &'a str, pub(crate) method: Method, } -impl Default for PageserverRequestLabelGroupSet { - fn default() -> Self { - Self { - pageserver_id: lasso::ThreadedRodeo::new(), - path: lasso::ThreadedRodeo::new(), - method: StaticLabelSet::new(), - } - } -} - #[derive(measured::LabelGroup)] #[label(set = DatabaseQueryErrorLabelGroupSet)] pub(crate) struct DatabaseQueryErrorLabelGroup { @@ -213,7 +156,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup { pub(crate) operation: DatabaseOperation, } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] Success, @@ -221,7 +164,7 @@ pub(crate) enum ReconcileOutcome { Cancel, } -#[derive(FixedCardinalityLabel, Clone)] +#[derive(FixedCardinalityLabel, Copy, Clone)] pub(crate) enum Method { Get, Put, @@ -246,11 +189,12 @@ impl From for Method { } } +#[derive(Clone, Copy)] pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); impl LabelValue for StatusCode { fn visit(&self, v: V) -> V::Output { - v.write_int(self.0.as_u16() as u64) + v.write_int(self.0.as_u16() as i64) } } @@ -268,7 +212,7 @@ impl FixedCardinalityLabel for StatusCode { } } -#[derive(FixedCardinalityLabel)] +#[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum DatabaseErrorLabel { Query, Connection, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 7ba6828deb..7b5513c908 100644 --- 
a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,6 +1,5 @@ use std::{str::FromStr, time::Duration}; -use hyper::StatusCode; use pageserver_api::{ controller_api::{ NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, @@ -9,6 +8,7 @@ use pageserver_api::{ shard::TenantShardId, }; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 8237229d7b..25b6b67e12 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,13 +1,14 @@ use pageserver_api::{ models::{ LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, }, shard::TenantShardId, }; use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; -use utils::id::{NodeId, TimelineId}; +use utils::id::{NodeId, TenantId, TimelineId}; /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage /// controller to collect metrics in a non-intrusive manner. 
@@ -88,6 +89,18 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + measured_request!( + "tenant_scan_remote_storage", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_scan_remote_storage(tenant_id).await + ) + } + pub(crate) async fn tenant_secondary_download( &self, tenant_id: TenantShardId, @@ -101,6 +114,27 @@ impl PageserverClient { ) } + pub(crate) async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + measured_request!( + "tenant_secondary_status", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_secondary_status(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + measured_request!( + "tenant_heatmap_upload", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_heatmap_upload(tenant_id).await + ) + } + pub(crate) async fn location_config( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 55fbfd10bc..dca37166ba 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -2,6 +2,7 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; use std::time::Duration; +use std::time::Instant; use self::split_state::SplitState; use camino::Utf8Path; @@ -79,7 +80,7 @@ pub(crate) enum DatabaseError { Logical(String), } -#[derive(measured::FixedCardinalityLabel, Clone)] +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] pub(crate) enum DatabaseOperation { InsertNode, UpdateNode, @@ -144,6 +145,31 @@ impl Persistence { } } + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + 
database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... ({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + /// Wraps `with_conn` in order to collect latency and error metrics async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult where @@ -153,9 +179,7 @@ impl Persistence { let latency = &METRICS_REGISTRY .metrics_group .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { - operation: op.clone(), - }); + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let res = self.with_conn(func).await; diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 49cfaad569..fe97f724c1 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,12 +1,12 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; -use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; +use reqwest::StatusCode; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -51,6 +51,10 @@ pub(super) struct Reconciler { /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, + /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many + /// we will spawn. 
+ pub(crate) _resource_units: ReconcileUnits, + /// A means to abort background reconciliation: it is essential to /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for @@ -66,6 +70,19 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O +pub(crate) struct ReconcileUnits { + _sem_units: tokio::sync::OwnedSemaphorePermit, +} + +impl ReconcileUnits { + pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self { + Self { + _sem_units: sem_units, + } + } +} + /// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. @@ -750,7 +767,10 @@ impl Reconciler { // It is up to the caller whether they want to drop out on this error, but they don't have to: // in general we should avoid letting unavailability of the cloud control plane stop us from // making progress. - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + if !matches!(e, NotifyError::ShuttingDown) { + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + } + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. 
self.compute_notify_failure = true; diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 862ac0cbfe..3ff0d87988 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -84,6 +84,20 @@ impl std::ops::Add for AffinityScore { } } +/// Hint for whether this is a sincere attempt to schedule, or a speculative +/// check for where we _would_ schedule (done during optimization) +#[derive(Debug)] +pub(crate) enum ScheduleMode { + Normal, + Speculative, +} + +impl Default for ScheduleMode { + fn default() -> Self { + Self::Normal + } +} + // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. #[derive(Debug, Default)] @@ -93,6 +107,8 @@ pub(crate) struct ScheduleContext { /// Specifically how many _attached_ locations are on each node pub(crate) attached_nodes: HashMap, + + pub(crate) mode: ScheduleMode, } impl ScheduleContext { @@ -329,27 +345,34 @@ impl Scheduler { scores.sort_by_key(|i| (i.1, i.2, i.0)); if scores.is_empty() { - // After applying constraints, no pageservers were left. We log some detail about - // the state of nodes to help understand why this happened. This is not logged as an error because - // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard. - tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:"); - for (node_id, node) in &self.nodes { + // After applying constraints, no pageservers were left. + if !matches!(context.mode, ScheduleMode::Speculative) { + // If this was not a speculative attempt, log details to understand why we couldn't + // schedule: this may help an engineer understand if some nodes are marked offline + // in a way that's preventing progress. 
tracing::info!( - "Node {node_id}: may_schedule={} shards={}", - node.may_schedule != MaySchedule::No, - node.shard_count + "Scheduling failure, while excluding {hard_exclude:?}, node states:" ); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule != MaySchedule::No, + node.shard_count + ); + } } - return Err(ScheduleError::ImpossibleConstraint); } // Lowest score wins let node_id = scores.first().unwrap().0; - tracing::info!( + + if !matches!(context.mode, ScheduleMode::Speculative) { + tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.0 .0).collect::>() ); + } // Note that we do not update shard count here to reflect the scheduling: that // is IntentState's job when the scheduled location is used. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 010558b797..d3a53066c9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -8,10 +8,14 @@ use std::{ }; use crate::{ - id_lock_map::IdLockMap, + compute_hook::NotifyError, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, persistence::{AbortShardSplitStatus, TenantFilter}, - reconciler::ReconcileError, - scheduler::ScheduleContext, + reconciler::{ReconcileError, ReconcileUnits}, + scheduler::{ScheduleContext, ScheduleMode}, + tenant_shard::{ + MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction, + }, }; use anyhow::Context; use control_plane::storage_controller::{ @@ -19,7 +23,6 @@ use control_plane::storage_controller::{ }; use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; -use hyper::StatusCode; use itertools::Itertools; use pageserver_api::{ controller_api::{ @@ -31,6 +34,8 @@ use pageserver_api::{ }, models::{SecondaryProgress, TenantConfigRequest}, }; 
+use reqwest::StatusCode; +use tracing::instrument; use crate::pageserver_client::PageserverClient; use pageserver_api::{ @@ -48,11 +53,11 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; -use tokio::sync::OwnedRwLockWriteGuard; +use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use tracing::instrument; use utils::{ completion::Barrier, + failpoint_support, generation::Generation, http::error::ApiError, id::{NodeId, TenantId, TimelineId}, @@ -60,7 +65,7 @@ use utils::{ }; use crate::{ - compute_hook::{self, ComputeHook}, + compute_hook::ComputeHook, heartbeater::{Heartbeater, PageserverState}, node::{AvailabilityTransition, Node}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, @@ -77,7 +82,7 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); // For operations that might be slow, like migrating a tenant with // some data in it. -const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); +pub const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); // If we receive a call using Secondary mode initially, it will omit generation. We will initialize // tenant shards into this generation, and as long as it remains in this generation, we will accept @@ -88,7 +93,38 @@ const INITIAL_GENERATION: Generation = Generation::new(0); /// up on unresponsive pageservers and proceed. pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); -pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); +/// How long a node may be unresponsive to heartbeats before we declare it offline. +/// This must be long enough to cover node restarts as well as normal operations: in future +/// it should be separated into distinct timeouts for startup vs. 
normal operation +/// (``) +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); + +#[derive(Clone, strum_macros::Display)] +enum TenantOperations { + Create, + LocationConfig, + ConfigSet, + TimeTravelRemoteStorage, + Delete, + UpdatePolicy, + ShardSplit, + SecondaryDownload, + TimelineCreate, + TimelineDelete, +} + +#[derive(Clone, strum_macros::Display)] +enum NodeOperations { + Register, + Configure, +} + +pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; + +// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. +// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly +// than they're being pushed onto the queue. +const MAX_DELAYED_RECONCILES: usize = 10000; // Top level state available to all HTTP handlers struct ServiceState { @@ -97,6 +133,45 @@ struct ServiceState { nodes: Arc>, scheduler: Scheduler, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, +} + +/// Transform an error from a pageserver into an error to return to callers of a storage +/// controller API. 
+fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { + match e { + mgmt_api::Error::ReceiveErrorBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable( + format!("{node} error receiving error body: {str}").into(), + ) + } + mgmt_api::Error::ReceiveBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into()) + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { + ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into()) + } + mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => { + ApiError::ResourceUnavailable(format!("{node}: {msg}").into()) + } + mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg) + | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => { + // Auth errors talking to a pageserver are not auth errors for the caller: they are + // internal server errors, showing that something is wrong with the pageserver or + // storage controller's auth configuration. + ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) + } + mgmt_api::Error::ApiError(status, msg) => { + // Presume general case of pageserver API errors is that we tried to do something + // that can't be done right now. + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } + mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + } } impl ServiceState { @@ -104,11 +179,13 @@ impl ServiceState { nodes: HashMap, tenants: BTreeMap, scheduler: Scheduler, + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, ) -> Self { Self { tenants, nodes: Arc::new(nodes), scheduler, + delayed_reconcile_rx, } } @@ -142,6 +219,9 @@ pub struct Config { /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. 
pub max_unavailable_interval: Duration, + + /// How many Reconcilers may be spawned concurrently + pub reconciler_concurrency: usize, } impl From for ApiError { @@ -174,11 +254,22 @@ pub struct Service { // Locking on a tenant granularity (covers all shards in the tenant): // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) - tenant_op_locks: IdLockMap, + tenant_op_locks: IdLockMap, // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or // that transition it to/from Active. - node_op_locks: IdLockMap, + node_op_locks: IdLockMap, + + // Limit how many Reconcilers we will spawn concurrently + reconciler_concurrency: Arc, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + /// Send into this queue to promptly attempt to reconcile this shard next time units are available. + /// + /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceInner. This could be optimized to + /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. 
+ delayed_reconcile_tx: tokio::sync::mpsc::Sender, // Process shutdown will fire this token cancel: CancellationToken, @@ -239,7 +330,7 @@ struct TenantShardSplitAbort { new_shard_count: ShardCount, new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant - _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, + _tenant_lock: WrappedWriteGuard, } #[derive(thiserror::Error, Debug)] @@ -269,7 +360,12 @@ impl Service { /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date /// view of the world, and determine which pageservers are responsive. #[instrument(skip_all)] - async fn startup_reconcile(self: &Arc) { + async fn startup_reconcile( + self: &Arc, + bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { // For all tenant shards, a vector of observed states on nodes (where None means // indeterminate, same as in [`ObservedStateLocation`]) let mut observed: HashMap)>> = @@ -288,10 +384,6 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); - let compute_notify_deadline = start_at - .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3) - .expect("Reconcile timeout is a modest constant"); - // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); @@ -317,6 +409,7 @@ impl Service { let mut compute_notifications = Vec::new(); // Populate intent and observed states for all tenants, based on reported state on pageservers + tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -383,28 +476,27 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. 
// - // Ordering: we must complete these notification attempts before doing any other reconciliation for the - // tenants named here, because otherwise our calls to notify() might race with more recent values - // generated by reconciliation. - let notify_failures = self - .compute_notify_many(compute_notifications, compute_notify_deadline) - .await; - - // Compute notify is fallible. If it fails here, do not delay overall startup: set the - // flag on these shards that they have a pending notification. - // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later. - { - let mut locked = self.inner.write().unwrap(); - for tenant_shard_id in notify_failures.into_iter() { - if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { - shard.pending_compute_notification = true; - } - } - } + // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later + // calls will be correctly ordered wrt these. + // + // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore + // unit and start doing I/O. + tracing::info!( + "Sending {} compute notifications", + compute_notifications.len() + ); + self.compute_hook.notify_background( + compute_notifications, + bg_compute_notify_result_tx.clone(), + &self.cancel, + ); // Finally, now that the service is up and running, launch reconcile operations for any tenants // which require it: under normal circumstances this should only include tenants that were in some // transient state before we restarted, or any tenants whose compute hooks failed above. 
+ tracing::info!("Checking for shards in need of reconciliation..."); let reconcile_tasks = self.reconcile_all(); // We will not wait for these reconciliation tasks to run here: we're now done with startup and // normal operations may proceed. @@ -445,6 +537,7 @@ impl Service { } } + tracing::info!("Sending initial heartbeats..."); let res = self .heartbeater .heartbeat(Arc::new(nodes_to_heartbeat)) @@ -481,6 +574,7 @@ impl Service { let mut node_list_futs = FuturesUnordered::new(); + tracing::info!("Scanning shards on {} nodes...", nodes.len()); for node in nodes.values() { node_list_futs.push({ async move { @@ -600,72 +694,6 @@ impl Service { } } - /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications. - /// - /// Returns a set of any shards for which notifications where not acked within the deadline. - async fn compute_notify_many( - &self, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, - deadline: Instant, - ) -> HashSet { - let attempt_shards = notifications.iter().map(|i| i.0).collect::>(); - let mut success_shards = HashSet::new(); - - // Construct an async stream of futures to invoke the compute notify function: we do this - // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. - let mut stream = futures::stream::iter(notifications.into_iter()) - .map(|(tenant_shard_id, node_id, stripe_size)| { - let compute_hook = self.compute_hook.clone(); - let cancel = self.cancel.clone(); - async move { - if let Err(e) = compute_hook - .notify(tenant_shard_id, node_id, stripe_size, &cancel) - .await - { - tracing::error!( - %tenant_shard_id, - %node_id, - "Failed to notify compute on startup for shard: {e}" - ); - None - } else { - Some(tenant_shard_id) - } - } - }) - .buffered(compute_hook::API_CONCURRENCY); - - loop { - tokio::select! 
{ - next = stream.next() => { - match next { - Some(Some(success_shard)) => { - // A notification succeeded - success_shards.insert(success_shard); - }, - Some(None) => { - // A notification that failed - }, - None => { - tracing::info!("Successfully sent all compute notifications"); - break; - } - } - }, - _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { - // Give up sending any that didn't succeed yet - tracing::info!("Reached deadline while sending compute notifications"); - break; - } - }; - } - - attempt_shards - .difference(&success_shards) - .cloned() - .collect() - } - /// Long running background task that periodically wakes up and looks for shards that need /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible @@ -683,7 +711,7 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - self.optimize_all(); + self.optimize_all().await; } } _ = self.cancel.cancelled() => return @@ -742,8 +770,9 @@ impl Service { } /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation - /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantShard::maybe_reconcile`] will do nothing. + /// was successful and intent hasn't changed since the Reconciler was spawned, this will update + /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] + /// will indicate that reconciliation is not needed. 
#[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -796,36 +825,72 @@ impl Service { // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. - *(tenant.last_error.lock().unwrap()) = format!("{e}"); - tenant.error_waiter.advance(result.sequence); + tenant.set_last_error(result.sequence, e); for (node_id, o) in result.observed.locations { tenant.observed.locations.insert(node_id, o); } } } + + // Maybe some other work can proceed now that this job finished. + if self.reconciler_concurrency.available_permits() > 0 { + while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + if let Some(shard) = tenants.get_mut(&tenant_shard_id) { + shard.delayed_reconcile = false; + self.maybe_reconcile_shard(shard, nodes); + } + + if self.reconciler_concurrency.available_permits() == 0 { + break; + } + } + } } async fn process_results( &self, mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< + Result<(), (TenantShardId, NotifyError)>, + >, ) { loop { // Wait for the next result, or for cancellation - let result = tokio::select! { + tokio::select! 
{ r = result_rx.recv() => { match r { - Some(result) => {result}, + Some(result) => {self.process_result(result);}, None => {break;} } } + _ = async{ + match bg_compute_hook_result_rx.recv().await { + Some(result) => { + if let Err((tenant_shard_id, notify_error)) = result { + tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}"); + let mut locked = self.inner.write().unwrap(); + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + + } + }, + None => { + // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown + self.cancel.cancelled().await; + } + } + } => {}, _ = self.cancel.cancelled() => { break; } }; - - self.process_result(result); } + + // We should only fall through on shutdown + assert!(self.cancel.is_cancelled()); } async fn process_aborts( @@ -986,6 +1051,13 @@ impl Service { let (startup_completion, startup_complete) = utils::completion::channel(); + // This channel is continuously consumed by process_results, so doesn't need to be very large. 
+ let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) = + tokio::sync::mpsc::channel(512); + + let (delayed_reconcile_tx, delayed_reconcile_rx) = + tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); + let cancel = CancellationToken::new(); let heartbeater = Heartbeater::new( config.jwt_token.clone(), @@ -994,13 +1066,20 @@ impl Service { ); let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( - nodes, tenants, scheduler, + nodes, + tenants, + scheduler, + delayed_reconcile_rx, ))), config: config.clone(), persistence, - compute_hook: Arc::new(ComputeHook::new(config)), + compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, heartbeater, + reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.reconciler_concurrency, + )), + delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), cancel, @@ -1013,7 +1092,9 @@ impl Service { tokio::task::spawn(async move { // Block shutdown until we're done (we must respect self.cancel) if let Ok(_gate) = result_task_this.gate.enter() { - result_task_this.process_results(result_rx).await + result_task_this + .process_results(result_rx, bg_compute_notify_result_rx) + .await } }); @@ -1055,7 +1136,7 @@ impl Service { return; }; - this.startup_reconcile().await; + this.startup_reconcile(bg_compute_notify_result_tx).await; drop(startup_completion); } }); @@ -1282,7 +1363,7 @@ impl Service { async fn node_activate_reconcile( &self, mut node: Node, - _lock: &OwnedRwLockWriteGuard<()>, + _lock: &WrappedWriteGuard, ) -> Result<(), ApiError> { // This Node is a mutable local copy: we will set it active so that we can use its // API client to reconcile with the node. 
The Node in [`Self::nodes`] will get updated @@ -1528,14 +1609,15 @@ impl Service { let tenant_id = create_req.new_tenant_id.tenant_id; // Exclude any concurrent attempts to create/access the same tenant ID - let _tenant_lock = self - .tenant_op_locks - .exclusive(create_req.new_tenant_id.tenant_id) - .await; - + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + create_req.new_tenant_id.tenant_id, + TenantOperations::Create, + ) + .await; let (response, waiters) = self.do_tenant_create(create_req).await?; - if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to // accept compute notifications while it is in the process of creating. Reconciliation will // be retried in the background. @@ -1871,10 +1953,12 @@ impl Service { req: TenantLocationConfigRequest, ) -> Result { // We require an exclusive lock, because we are updating both persistent and in-memory state - let _tenant_lock = self - .tenant_op_locks - .exclusive(tenant_shard_id.tenant_id) - .await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::LocationConfig, + ) + .await; if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( @@ -1992,7 +2076,12 @@ impl Service { pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigSet, + ) + .await; let tenant_id = req.tenant_id; let config = req.config; @@ -2081,7 +2170,12 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> 
Result<(), ApiError> { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimeTravelRemoteStorage, + ) + .await; let node = { let locked = self.inner.read().unwrap(); @@ -2172,7 +2266,12 @@ impl Service { tenant_id: TenantId, wait: Option, ) -> Result<(StatusCode, SecondaryProgress), ApiError> { - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::SecondaryDownload, + ) + .await; // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { @@ -2266,7 +2365,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = + trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; self.ensure_attached_wait(tenant_id).await?; @@ -2366,7 +2466,14 @@ impl Service { req: TenantPolicyRequest, ) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::UpdatePolicy, + ) + .await; + + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); let TenantPolicyRequest { placement, @@ -2420,7 +2527,12 @@ impl Service { create_req.new_timeline_id, ); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2467,17 +2579,7 @@ impl Service { client .timeline_create(tenant_shard_id, &create_req) .await - .map_err(|e| match e 
{ - mgmt_api::Error::ApiError(status, msg) - if status == StatusCode::INTERNAL_SERVER_ERROR - || status == StatusCode::NOT_ACCEPTABLE => - { - // TODO: handle more error codes, e.g. 503 should be passed through. Make a general wrapper - // for pass-through API calls. - ApiError::InternalServerError(anyhow::anyhow!(msg)) - } - _ => ApiError::Conflict(format!("Failed to create timeline: {e}")), - }) + .map_err(|e| passthrough_api_error(&node, e)) } // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then @@ -2539,13 +2641,57 @@ impl Service { Ok(results) } + /// Concurrently invoke a pageserver API call on many shards at once + pub(crate) async fn tenant_for_shards_api( + &self, + locations: Vec<(TenantShardId, Node)>, + op: O, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Vec> + where + O: Fn(TenantShardId, PageserverClient) -> F + Copy, + F: std::future::Future>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(async move { + node.with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.jwt_token, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await + }); + } + + while let Some(r) = futs.next().await { + let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); + results.push(r); + } + + results + } + pub(crate) async fn tenant_timeline_delete( &self, tenant_id: TenantId, timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); - let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDelete, + ) + .await; self.ensure_attached_wait(tenant_id).await?; @@ -2744,7 +2890,7 @@ impl Service { let mut describe_shards = Vec::new(); for shard in shards { - if 
shard.tenant_shard_id.is_zero() { + if shard.tenant_shard_id.is_shard_zero() { shard_zero = Some(shard); } @@ -2752,7 +2898,14 @@ impl Service { tenant_shard_id: shard.tenant_shard_id, node_attached: *shard.intent.get_attached(), node_secondary: shard.intent.get_secondary().to_vec(), - last_error: shard.last_error.lock().unwrap().clone(), + last_error: shard + .last_error + .lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()) + .clone(), is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), @@ -2976,11 +3129,14 @@ impl Service { ) -> ( TenantShardSplitResponse, Vec<(TenantShardId, NodeId, ShardStripeSize)>, + Vec, ) { let mut response = TenantShardSplitResponse { new_shards: Vec::new(), }; let mut child_locations = Vec::new(); + let mut waiters = Vec::new(); + { let mut locked = self.inner.write().unwrap(); @@ -3059,14 +3215,112 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - self.maybe_reconcile_shard(&mut child_state, nodes); + if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + waiters.push(waiter); + } tenants.insert(child, child_state); response.new_shards.push(child); } } + (response, child_locations, waiters) + } + } - (response, child_locations) + async fn tenant_shard_split_start_secondaries( + &self, + tenant_id: TenantId, + waiters: Vec, + ) { + // Wait for initial reconcile of child shards, this creates the secondary locations + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // This is not a failure to split: it's some issue reconciling the new child shards, perhaps + // their secondaries couldn't be attached. 
+ tracing::warn!("Failed to reconcile after split: {e}"); + return; + } + + // Take the state lock to discover the attached & secondary intents for all shards + let (attached, secondary) = { + let locked = self.inner.read().unwrap(); + let mut attached = Vec::new(); + let mut secondary = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let Some(node_id) = shard.intent.get_attached() else { + // Unexpected. Race with a PlacementPolicy change? + tracing::warn!( + "No attached node on {tenant_shard_id} immediately after shard split!" + ); + continue; + }; + + let Some(secondary_node_id) = shard.intent.get_secondary().first() else { + // No secondary location. Nothing for us to do. + continue; + }; + + let attached_node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + let secondary_node = locked + .nodes + .get(secondary_node_id) + .expect("Pageservers may not be deleted while referenced"); + + attached.push((*tenant_shard_id, attached_node.clone())); + secondary.push((*tenant_shard_id, secondary_node.clone())); + } + (attached, secondary) + }; + + if secondary.is_empty() { + // No secondary locations; nothing for us to do + return; + } + + for result in self + .tenant_for_shards_api( + attached, + |tenant_shard_id, client| async move { + client.tenant_heatmap_upload(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling heatmap upload after shard split: {e}"); + return; + } + } + + for result in self + .tenant_for_shards_api( + secondary, + |tenant_shard_id, client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::ZERO)) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling secondary download after shard split: {e}"); + return; + 
} } } @@ -3077,7 +3331,12 @@ impl Service { ) -> Result { // TODO: return 503 if we get stuck waiting for this lock // (issue https://github.com/neondatabase/neon/issues/7108) - let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::ShardSplit, + ) + .await; let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -3095,8 +3354,8 @@ impl Service { .do_tenant_shard_split(tenant_id, shard_split_params) .await; - match r { - Ok(r) => Ok(r), + let (response, waiters) = match r { + Ok(r) => r, Err(e) => { // Split might be part-done, we must do work to abort it. tracing::warn!("Enqueuing background abort of split on {tenant_id}"); @@ -3109,9 +3368,17 @@ impl Service { }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); - Err(e) + return Err(e); } - } + }; + + // The split is now complete. As an optimization, we will trigger all the child shards to upload + // a heatmap immediately, and all their secondary locations to start downloading: this avoids waiting + // for the background heatmap/download interval before secondaries get warm enough to migrate shards + // in [`Self::optimize_all`] + self.tenant_shard_split_start_secondaries(tenant_id, waiters) + .await; + Ok(response) } fn prepare_tenant_shard_split( @@ -3261,7 +3528,7 @@ impl Service { &self, tenant_id: TenantId, params: ShardSplitParams, - ) -> Result { + ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another // request could occur here, deleting or mutating the tenant. 
begin_shard_split checks that the // parent shards exist as expected, but it would be neater to do the above pre-checks within the @@ -3463,7 +3730,7 @@ impl Service { )); // Replace all the shards we just split with their children: this phase is infallible. - let (response, child_locations) = + let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); // Send compute notifications for all the new shards @@ -3490,7 +3757,7 @@ impl Service { } } - Ok(response) + Ok((response, waiters)) } pub(crate) async fn tenant_shard_migrate( @@ -3595,6 +3862,88 @@ impl Service { Ok(()) } + /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a + /// tenant with a very high generation number so that it will see the existing data. + pub(crate) async fn tenant_import( + &self, + tenant_id: TenantId, + ) -> Result { + // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage + let maybe_node = { + self.inner + .read() + .unwrap() + .nodes + .values() + .find(|n| n.is_available()) + .cloned() + }; + let Some(node) = maybe_node else { + return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available"))); + }; + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + let scan_result = client + .tenant_scan_remote_storage(tenant_id) + .await + .map_err(|e| passthrough_api_error(&node, e))?; + + // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count. 
+ let Some(shard_count) = scan_result + .shards + .iter() + .map(|s| s.tenant_shard_id.shard_count) + .max() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found").into(), + )); + }; + + // Ideally we would set each newly imported shard's generation independently, but for correctness it is sufficient + // to + let generation = scan_result + .shards + .iter() + .map(|s| s.generation) + .max() + .expect("We already validated >0 shards"); + + // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will + // only work if they were using the default stripe size. + let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + + let (response, waiters) = self + .do_tenant_create(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + + shard_parameters: ShardParameters { + count: shard_count, + stripe_size, + }, + placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking + + // There is no way to know what the tenant's config was: revert to defaults + config: TenantConfig::default(), + }) + .await?; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this + // tenant doesn't exist in the control plane), so don't fail the request if it can't fully + // reconcile, as reconciliation includes notifying compute. + tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})"); + } + + Ok(response) + } + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that /// we don't have to make TenantShard clonable in the return path. 
pub(crate) fn tenants_dump(&self) -> Result, ApiError> { @@ -3756,9 +4105,13 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + let _node_lock = trace_exclusive_lock( + &self.node_op_locks, + register_req.node_id, + NodeOperations::Register, + ) + .await; - // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { @@ -3845,7 +4198,8 @@ impl Service { availability: Option, scheduling: Option, ) -> Result<(), ApiError> { - let _node_lock = self.node_op_locks.exclusive(node_id).await; + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await; if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before @@ -3978,7 +4332,7 @@ impl Service { // TODO: in the background, we should balance work back onto this pageserver } AvailabilityTransition::Unchanged => { - tracing::info!("Node {} no change during config", node_id); + tracing::debug!("Node {} no change during config", node_id); } } @@ -4053,20 +4407,64 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides - /// all the references to parts of Self that are needed + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, ) -> Option { - shard.maybe_reconcile( + let reconcile_needed = shard.get_reconcile_needed(nodes); + + match reconcile_needed { + ReconcileNeeded::No => return None, + ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), + ReconcileNeeded::Yes => { + // Fall through to try and acquire units for spawning reconciler + } + }; + + let units = match self.reconciler_concurrency.clone().try_acquire_owned() { + Ok(u) => 
ReconcileUnits::new(u), + Err(_) => { + tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), + "Concurrency limited: enqueued for reconcile later"); + if !shard.delayed_reconcile { + match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) { + Err(TrySendError::Closed(_)) => { + // Weird mid-shutdown case? + } + Err(TrySendError::Full(_)) => { + // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task. + tracing::warn!( + "Many shards are waiting to reconcile: delayed_reconcile queue is full" + ); + } + Ok(()) => { + shard.delayed_reconcile = true; + } + } + } + + // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence + // number to advance. When this function is eventually called again and succeeds in getting units, + // it will spawn a reconciler that makes this waiter complete. + return Some(shard.future_reconcile_waiter()); + } + }; + + let Ok(gate_guard) = self.gate.enter() else { + // Gate closed: we're shutting down, drop out. + return None; + }; + + shard.spawn_reconciler( &self.result_tx, nodes, &self.compute_hook, &self.config, &self.persistence, - &self.gate, + units, + gate_guard, &self.cancel, ) } @@ -4074,7 +4472,9 @@ impl Service { /// Check all tenants for pending reconciliation work, and reconcile those in need. /// Additionally, reschedule tenants that require it. /// - /// Returns how many reconciliation tasks were started + /// Returns how many reconciliation tasks were started, or `1` if no reconciles were + /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where + /// available. A return value of 0 indicates that everything is fully reconciled already. 
fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); @@ -4084,10 +4484,19 @@ impl Service { let mut reconciles_spawned = 0; for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { schedule_context = ScheduleContext::default(); } + // Skip checking if this shard is already enqueued for reconciliation + if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { + // If there is something delayed, then return a nonzero count so that + // callers like reconcile_all_now do not incorrectly get the impression + // that the system is in a quiescent state. + reconciles_spawned = std::cmp::max(1, reconciles_spawned); + continue; + } + // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { @@ -4114,33 +4523,77 @@ impl Service { /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at /// the time of scheduling, this function looks for cases where a better-scoring location is available /// according to those same soft constraints. - fn optimize_all(&self) -> usize { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - let pageservers = nodes.clone(); - - let mut schedule_context = ScheduleContext::default(); - - let mut reconciles_spawned = 0; - - let mut tenant_shards: Vec<&TenantShard> = Vec::new(); - + async fn optimize_all(&self) -> usize { // Limit on how many shards' optmizations each call to this function will execute. Combined // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. 
- const MAX_OPTIMIZATIONS_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + + // Synchronous prepare: scan shards for possible scheduling optimizations + let candidate_work = self.optimize_all_plan(); + let candidate_work_len = candidate_work.len(); + + // Asynchronous validate: I/O to pageservers to make sure shards are in a good state to apply validation + let validated_work = self.optimize_all_validate(candidate_work).await; + + let was_work_filtered = validated_work.len() != candidate_work_len; + + // Synchronous apply: update the shards' intent states according to validated optimisations + let mut reconciles_spawned = 0; + let mut optimizations_applied = 0; + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, optimization) in validated_work { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Shard was dropped between planning and execution; + continue; + }; + if shard.apply_optimization(scheduler, optimization) { + optimizations_applied += 1; + if self.maybe_reconcile_shard(shard, nodes).is_some() { + reconciles_spawned += 1; + } + } + + if optimizations_applied >= MAX_OPTIMIZATIONS_EXEC_PER_PASS { + break; + } + } + + if was_work_filtered { + // If we filtered any work out during validation, ensure we return a nonzero value to indicate + // to callers that the system is not in a truly quiet state, it's going to do some work as soon + // as these validations start passing. 
+ reconciles_spawned = std::cmp::max(reconciles_spawned, 1); + } + + reconciles_spawned + } + + fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { + let mut schedule_context = ScheduleContext::default(); + + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); + + // How many candidate optimizations we will generate, before evaluating them for readniess: setting + // this higher than the execution limit gives us a chance to execute some work even if the first + // few optimizations we find are not ready. + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; let mut work = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_zero() { + if tenant_shard_id.is_shard_zero() { // Reset accumulators on the first shard in a tenant schedule_context = ScheduleContext::default(); + schedule_context.mode = ScheduleMode::Speculative; tenant_shards.clear(); } - if work.len() >= MAX_OPTIMIZATIONS_PER_PASS { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { break; } @@ -4212,18 +4665,105 @@ impl Service { } } - for (tenant_shard_id, optimization) in work { - let shard = tenants - .get_mut(&tenant_shard_id) - .expect("We held lock from place we got this ID"); - shard.apply_optimization(scheduler, optimization); + work + } - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { - reconciles_spawned += 1; + async fn optimize_all_validate( + &self, + candidate_work: Vec<(TenantShardId, ScheduleOptimization)>, + ) -> Vec<(TenantShardId, ScheduleOptimization)> { + // Take a clone of the node map to use outside the lock in async validation phase + let validation_nodes = { self.inner.read().unwrap().nodes.clone() }; + + let mut want_secondary_status = Vec::new(); + + // Validate our plans: this is an async phase where we may do I/O to pageservers to + // check that the state of locations is acceptable to run the 
optimization, such as + // checking that a secondary location is sufficiently warmed-up to cleanly cut over + // in a live migration. + let mut validated_work = Vec::new(); + for (tenant_shard_id, optimization) in candidate_work { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: _, + new_attached_node_id, + }) => { + match validation_nodes.get(&new_attached_node_id) { + None => { + // Node was dropped between planning and validation + } + Some(node) => { + if !node.is_available() { + tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + } else { + // Accumulate optimizations that require fetching secondary status, so that we can execute these + // remote API requests concurrently. + want_secondary_status.push(( + tenant_shard_id, + node.clone(), + optimization, + )); + } + } + } + } + ScheduleOptimizationAction::ReplaceSecondary(_) => { + // No extra checks needed to replace a secondary: this does not interrupt client access + validated_work.push((tenant_shard_id, optimization)) + } + }; + } + + // Call into pageserver API to find out if the destination secondary location is warm enough for a reasonably smooth migration: we + // do this so that we avoid spawning a Reconciler that would have to wait minutes/hours for a destination to warm up: that reconciler + // would hold a precious reconcile semaphore unit the whole time it was waiting for the destination to warm up. 
+ let results = self + .tenant_for_shards_api( + want_secondary_status + .iter() + .map(|i| (i.0, i.1.clone())) + .collect(), + |tenant_shard_id, client| async move { + client.tenant_secondary_status(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + for ((tenant_shard_id, node, optimization), secondary_status) in + want_secondary_status.into_iter().zip(results.into_iter()) + { + match secondary_status { + Err(e) => { + tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + } + Ok(progress) => { + // We require secondary locations to have less than 10GiB of downloads pending before we will use + // them in an optimization + const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; + + if progress.bytes_total == 0 + || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD + && progress.bytes_downloaded != progress.bytes_total + || progress.bytes_total - progress.bytes_downloaded + > DOWNLOAD_FRESHNESS_THRESHOLD + { + tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + } else { + // Location looks ready: proceed + tracing::info!( + "{tenant_shard_id} secondary on {node} is warm enough for migration: {progress:?}" + ); + validated_work.push((tenant_shard_id, optimization)) + } + } } } - reconciles_spawned + validated_work } /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but @@ -4231,10 +4771,12 @@ impl Service { /// put the system into a quiescent state where future background reconciliations won't do anything. 
pub(crate) async fn reconcile_all_now(&self) -> Result { let reconciles_spawned = self.reconcile_all(); - if reconciles_spawned == 0 { + let reconciles_spawned = if reconciles_spawned == 0 { // Only optimize when we are otherwise idle - self.optimize_all(); - } + self.optimize_all().await + } else { + reconciles_spawned + }; let waiters = { let mut waiters = Vec::new(); @@ -4248,8 +4790,27 @@ impl Service { }; let waiter_count = waiters.len(); - self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; - Ok(waiter_count) + match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + Ok(()) => {} + Err(ReconcileWaitError::Failed(_, reconcile_error)) + if matches!(*reconcile_error, ReconcileError::Cancel) => + { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superceded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. + } + Err(e) => { + return Err(e); + } + }; + + tracing::info!( + "{} reconciles in reconcile_all, {} waiters", + reconciles_spawned, + waiter_count + ); + + Ok(std::cmp::max(waiter_count, reconciles_spawned)) } pub async fn shutdown(&self) { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 58b8ef8d5d..dda17f9887 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,6 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, + reconciler::ReconcileUnits, scheduler::{AffinityScore, MaySchedule, ScheduleContext}, }; use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; @@ -22,7 +23,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, - sync::gate::Gate, + sync::gate::GateGuard, }; use crate::{ @@ -37,12 +38,18 @@ use crate::{ }; /// Serialization helper -fn read_mutex_content(v: &std::sync::Mutex, 
serializer: S) -> Result +fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result where S: serde::ser::Serializer, - T: Clone + std::fmt::Display, + T: std::fmt::Display, { - serializer.collect_str(&v.lock().unwrap()) + serializer.collect_str( + &v.lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()), + ) } /// In-memory state for a particular tenant shard. @@ -95,6 +102,10 @@ pub(crate) struct TenantShard { /// reconciliation, and timeline creation. pub(crate) splitting: SplitState, + /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag + /// is set. This flag is cleared when the tenant is popped off the delay queue. + pub(crate) delayed_reconcile: bool, + /// Optionally wait for reconciliation to complete up to a particular /// sequence number. #[serde(skip)] @@ -106,15 +117,19 @@ pub(crate) struct TenantShard { #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, - /// The most recent error from a reconcile on this tenant + /// The most recent error from a reconcile on this tenant. This is a nested Arc + /// because: + /// - ReconcileWaiters need to Arc-clone the overall object to read it later + /// - ReconcileWaitError needs to use an `Arc` because we can construct + /// many waiters for one shard, and the underlying error types are not Clone. /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? - #[serde(serialize_with = "read_mutex_content")] - pub(crate) last_error: std::sync::Arc>, + #[serde(serialize_with = "read_last_error")] + pub(crate) last_error: std::sync::Arc>>>, /// If we have a pending compute notification that for some reason we weren't able to send, - /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry - /// sending it. This is the mechanism by which compute notifications are included in the scope + /// set this to true. 
If this is set, calls to [`Self::get_reconcile_needed`] will return Yes + /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, @@ -288,18 +303,18 @@ pub(crate) struct ReconcilerWaiter { seq_wait: std::sync::Arc>, error_seq_wait: std::sync::Arc>, - error: std::sync::Arc>, + error: std::sync::Arc>>>, seq: Sequence, } #[derive(thiserror::Error, Debug)] -pub enum ReconcileWaitError { +pub(crate) enum ReconcileWaitError { #[error("Timeout waiting for shard {0}")] Timeout(TenantShardId), #[error("shutting down")] Shutdown, #[error("Reconcile error on shard {0}: {1}")] - Failed(TenantShardId, String), + Failed(TenantShardId, Arc), } #[derive(Eq, PartialEq, Debug)] @@ -310,18 +325,28 @@ pub(crate) struct ReplaceSecondary { #[derive(Eq, PartialEq, Debug)] pub(crate) struct MigrateAttachment { - old_attached_node_id: NodeId, - new_attached_node_id: NodeId, + pub(crate) old_attached_node_id: NodeId, + pub(crate) new_attached_node_id: NodeId, } #[derive(Eq, PartialEq, Debug)] -pub(crate) enum ScheduleOptimization { +pub(crate) enum ScheduleOptimizationAction { // Replace one of our secondary locations with a different node ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), } +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ScheduleOptimization { + // What was the reconcile sequence when we generated this optimization? The optimization + // should only be applied if the shard's sequence is still at this value, in case other changes + // happened between planning the optimization and applying it. + sequence: Sequence, + + pub(crate) action: ScheduleOptimizationAction, +} + impl ReconcilerWaiter { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { tokio::select! 
{ @@ -337,7 +362,8 @@ impl ReconcilerWaiter { SeqWaitError::Timeout => unreachable!() })?; - return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone())) + return Err(ReconcileWaitError::Failed(self.tenant_shard_id, + self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone())) } } @@ -353,6 +379,17 @@ pub(crate) struct ReconcilerHandle { cancel: CancellationToken, } +pub(crate) enum ReconcileNeeded { + /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler + /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it) + No, + /// shard has a reconciler running, and its intent hasn't changed since that one was + /// spawned: wait for the existing reconciler rather than spawning a new one. + WaitExisting(ReconcilerWaiter), + /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] + Yes, +} + /// When a reconcile task completes, it sends this result object /// to be applied to the primary TenantShard. 
pub(crate) struct ReconcileResult { @@ -396,6 +433,7 @@ impl TenantShard { reconciler: None, splitting: SplitState::Idle, sequence: Sequence(1), + delayed_reconcile: false, waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), @@ -647,10 +685,13 @@ impl TenantShard { "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + }), + }); } } else { tracing::debug!( @@ -708,28 +749,37 @@ impl TenantShard { "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", self.intent.get_secondary() ); - return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: *secondary, - new_node_id: candidate_node, - })); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + }), + }); } } None } + /// Return true if the optimization was really applied: it will not be applied if the optimization's + /// sequence is behind this tenant shard's pub(crate) fn apply_optimization( &mut self, scheduler: &mut Scheduler, optimization: ScheduleOptimization, - ) { + ) -> bool { + if optimization.sequence != self.sequence { + return false; + } + metrics::METRICS_REGISTRY .metrics_group .storage_controller_schedule_optimization .inc(); - match optimization { - ScheduleOptimization::MigrateAttachment(MigrateAttachment { + match optimization.action { + 
ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { old_attached_node_id, new_attached_node_id, }) => { @@ -737,7 +787,7 @@ impl TenantShard { self.intent .promote_attached(scheduler, new_attached_node_id); } - ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id, new_node_id, }) => { @@ -745,6 +795,8 @@ impl TenantShard { self.intent.push_secondary(scheduler, new_node_id); } } + + true } /// Query whether the tenant's observed state for attached node matches its intent state, and if so, @@ -831,16 +883,10 @@ impl TenantShard { #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) fn maybe_reconcile( + pub(crate) fn get_reconcile_needed( &mut self, - result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, - compute_hook: &Arc, - service_config: &service::Config, - persistence: &Arc, - gate: &Gate, - cancel: &CancellationToken, - ) -> Option { + ) -> ReconcileNeeded { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. let mut dirty_observed = false; @@ -862,8 +908,8 @@ impl TenantShard { active_nodes_dirty || dirty_observed || self.pending_compute_notification; if !do_reconcile { - tracing::info!("Not dirty, no reconciliation needed."); - return None; + tracing::debug!("Not dirty, no reconciliation needed."); + return ReconcileNeeded::No; } // If we are currently splitting, then never start a reconciler task: the splitting logic @@ -871,7 +917,7 @@ impl TenantShard { // up top, so that we only log this message if we would otherwise have done a reconciliation. 
if !matches!(self.splitting, SplitState::Idle) { tracing::info!("Refusing to reconcile, splitting in progress"); - return None; + return ReconcileNeeded::No; } // Reconcile already in flight for the current sequence? @@ -881,7 +927,7 @@ impl TenantShard { "Reconciliation already in progress for sequence {:?}", self.sequence, ); - return Some(ReconcilerWaiter { + return ReconcileNeeded::WaitExisting(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), @@ -900,10 +946,67 @@ impl TenantShard { // We only reach this point if there is work to do and we're going to skip // doing it: warn it obvious why this tenant isn't doing what it ought to. tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); - return None; + return ReconcileNeeded::No; } } + ReconcileNeeded::Yes + } + + /// Ensure the sequence number is set to a value where waiting for this value will make us wait + /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers. + /// + /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property + /// that the waiter will not complete until some future Reconciler is constructed and run. + fn ensure_sequence_ahead(&mut self) { + // Find the highest sequence for which a Reconciler has previously run or is currently + // running + let max_seen = std::cmp::max( + self.reconciler + .as_ref() + .map(|r| r.sequence) + .unwrap_or(Sequence(0)), + std::cmp::max(self.waiter.load(), self.error_waiter.load()), + ); + + if self.sequence <= max_seen { + self.sequence = max_seen.next(); + } + } + + /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. + /// + /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but + /// you would like to wait on the next reconciler that gets spawned in the background. 
+ pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { + self.ensure_sequence_ahead(); + + ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + } + } + + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn spawn_reconciler( + &mut self, + result_tx: &tokio::sync::mpsc::UnboundedSender, + pageservers: &Arc>, + compute_hook: &Arc, + service_config: &service::Config, + persistence: &Arc, + units: ReconcileUnits, + gate_guard: GateGuard, + cancel: &CancellationToken, + ) -> Option { + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before + // doing our sequence's work. + let old_handle = self.reconciler.take(); + // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { @@ -919,18 +1022,9 @@ impl TenantShard { } } - // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before - // doing our sequence's work. - let old_handle = self.reconciler.take(); - - let Ok(gate_guard) = gate.enter() else { - // Shutting down, don't start a reconciler - return None; - }; - // Advance the sequence before spawning a reconciler, so that sequence waiters // can distinguish between before+after the reconcile completes. 
- self.sequence = self.sequence.next(); + self.ensure_sequence_ahead(); let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); @@ -945,6 +1039,7 @@ impl TenantShard { compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, + _resource_units: units, cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, @@ -1011,16 +1106,18 @@ impl TenantShard { status: outcome_label, }); - result_tx - .send(ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, - }) - .ok(); + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. + let result = ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }; + + result_tx.send(result).ok(); } .instrument(reconciler_span), ); @@ -1089,6 +1186,13 @@ impl TenantShard { &self.scheduling_policy } + pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { + // Ordering: always set last_error before advancing sequence, so that sequence + // waiters are guaranteed to see a Some value when they see an error. 
+ *(self.last_error.lock().unwrap()) = Some(Arc::new(error)); + self.error_waiter.advance(sequence); + } + pub(crate) fn from_persistent( tsp: TenantShardPersistence, intent: IntentState, @@ -1111,6 +1215,7 @@ impl TenantShard { error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), pending_compute_notification: false, + delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), }) } @@ -1347,10 +1452,13 @@ pub(crate) mod tests { // would be no other shards from the same tenant, and request to do so. assert_eq!( optimization_a, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) ); // Note that these optimizing two shards in the same tenant with the same ScheduleContext is @@ -1361,10 +1469,13 @@ pub(crate) mod tests { let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); assert_eq!( optimization_b, - Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - })) + Some(ScheduleOptimization { + sequence: shard_b.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + }) + }) ); // Applying these optimizations should result in the end state proposed @@ -1408,10 +1519,13 @@ pub(crate) mod tests { // same tenant should generate an optimization to move one away assert_eq!( optimization_a, - Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary { - old_node_id: NodeId(3), - new_node_id: NodeId(4) - })) + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: 
ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + }) + }) ); shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 9dd66fe636..a883d94f73 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -14,10 +14,18 @@ class ComputeReconfigure: self.server = server self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" self.workloads = {} + self.on_notify = None def register_workload(self, workload): self.workloads[workload.tenant_id] = workload + def register_on_notify(self, fn): + """ + Add some extra work during a notification, like sleeping to slow things down, or + logging what was notified. + """ + self.on_notify = fn + @pytest.fixture(scope="function") def compute_reconfigure_listener(make_httpserver): @@ -43,6 +51,9 @@ def compute_reconfigure_listener(make_httpserver): body: dict[str, Any] = request.json log.info(f"notify-attach request: {body}") + if self.on_notify is not None: + self.on_notify(body) + try: workload = self.workloads[TenantId(body["tenant_id"])] except KeyError: diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c615dd154f..7d34e12ca3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] 
= ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0e4a58c099..240b6ee199 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -499,6 +499,7 @@ class NeonEnvBuilder: self.config_init_force: Optional[str] = None self.top_output_dir = top_output_dir self.control_plane_compute_hook_api: Optional[str] = None + self.storage_controller_config: Optional[dict[Any, Any]] = None self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine @@ -507,6 +508,16 @@ class NeonEnvBuilder: self.pageserver_get_vectored_impl = "vectored" log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + self.pageserver_get_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored": + self.pageserver_get_impl = "vectored" + log.debug('Overriding pageserver get_impl config to "vectored"') + + self.pageserver_validate_vectored_get: Optional[bool] = None + if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: + self.pageserver_validate_vectored_get = bool(validate) + log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1011,6 +1022,7 @@ class NeonEnv: self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 self.pageserver_config_override = config.pageserver_config_override + self.storage_controller_config = config.storage_controller_config # 
generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -1056,6 +1068,9 @@ class NeonEnv: if self.control_plane_compute_hook_api is not None: cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.storage_controller_config is not None: + cfg["storage_controller"] = self.storage_controller_config + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -1078,6 +1093,10 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_get_vectored_impl is not None: ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl + if config.pageserver_get_impl is not None: + ps_cfg["get_impl"] = config.pageserver_get_impl + if config.pageserver_validate_vectored_get is not None: + ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get # Create a corresponding NeonPageserver object self.pageservers.append( @@ -1085,7 +1104,6 @@ class NeonEnv: self, ps_id, port=pageserver_port, - config_override=self.pageserver_config_override, ) ) cfg["pageservers"].append(ps_cfg) @@ -1120,12 +1138,9 @@ class NeonEnv: # bounce through retries on startup self.storage_controller.start() - def storage_controller_ready(): - assert self.storage_controller.ready() is True - # Wait for storage controller readiness to prevent unnecessary post start-up # reconcile. 
- wait_until(30, 1, storage_controller_ready) + self.storage_controller.wait_until_ready() # Start up broker, pageserver and all safekeepers futs = [] @@ -1568,6 +1583,11 @@ class NeonCli(AbstractNeonCli): res.check_returncode() return tenant_id, timeline_id + def import_tenant(self, tenant_id: TenantId): + args = ["tenant", "import", "--tenant-id", str(tenant_id)] + res = self.raw_cli(args) + res.check_returncode() + def set_default(self, tenant_id: TenantId): """ Update default tenant for future operations that require tenant_id. @@ -1781,6 +1801,7 @@ class NeonCli(AbstractNeonCli): hot_standby: bool = False, lsn: Optional[Lsn] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1804,6 +1825,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--hot-standby", "true"]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1815,6 +1838,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1829,6 +1853,8 @@ class NeonCli(AbstractNeonCli): args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1938,6 +1964,55 @@ class Pagectl(AbstractNeonCli): return IndexPartDump.from_json(parsed) +class LogUtils: + """ + A mixin class which provides utilities for inspecting the logs of a service. 
+ """ + + def __init__(self, logfile: Path) -> None: + self.logfile = logfile + + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + """Check that the log contains a line that matches the given regex""" + logfile = self.logfile + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 + with logfile.open("r") as f: + for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue + elif contains_re.search(line): + # found it! 
+ cur_line_no += 1 + return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 + return None + + class StorageControllerApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) @@ -1945,12 +2020,13 @@ class StorageControllerApiException(Exception): self.status_code = status_code -class NeonStorageController(MetricsGetter): +class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS + self.logfile = self.workdir / "storage_controller.log" def start(self): assert not self.running @@ -2024,6 +2100,15 @@ class NeonStorageController(MetricsGetter): else: raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + def wait_until_ready(self): + t1 = time.time() + + def storage_controller_ready(): + assert self.ready() is True + + wait_until(30, 1, storage_controller_ready) + return time.time() - t1 + def attach_hook_issue( self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int ) -> int: @@ -2111,7 +2196,7 @@ class NeonStorageController(MetricsGetter): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[Dict[Any, Any]] = None, - placement_policy: Optional[str] = None, + placement_policy: Optional[Union[Dict[Any, Any] | str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -2200,6 +2285,13 @@ class NeonStorageController(MetricsGetter): headers=self.headers(TokenScope.ADMIN), ) + def tenant_import(self, tenant_id: TenantId): + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + headers=self.headers(TokenScope.ADMIN), + ) + def reconcile_all(self): r = self.request( "POST", @@ -2214,10 +2306,21 @@ class NeonStorageController(MetricsGetter): def 
reconcile_until_idle(self, timeout_secs=30): start_at = time.time() n = 1 + delay_sec = 0.5 + delay_max = 5 while n > 0: n = self.reconcile_all() - if time.time() - start_at > timeout_secs: + if n == 0: + break + elif time.time() - start_at > timeout_secs: raise RuntimeError("Timeout in reconcile_until_idle") + else: + # Don't call again right away: if we're waiting for many reconciles that + # are blocked on the concurrency limit, it slows things down to call + # reconcile_all frequently. + time.sleep(delay_sec) + delay_sec *= 2 + delay_sec = min(delay_sec, delay_max) def consistency_check(self): """ @@ -2247,6 +2350,10 @@ class NeonStorageController(MetricsGetter): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + @property + def workdir(self) -> Path: + return self.env.repo_dir + def __enter__(self) -> "NeonStorageController": return self @@ -2264,24 +2371,21 @@ class LogCursor: _line_no: int -class NeonPageserver(PgProtocol): +class NeonPageserver(PgProtocol, LogUtils): """ An object representing a running pageserver. """ TEMP_FILE_SUFFIX = "___temp" - def __init__( - self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None - ): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id self.running = False self.service_port = port - self.config_override = config_override self.version = env.get_binary_version("pageserver") - + self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. If your test expects an error, add it to # 'allowed_errors' in the test with something like: @@ -2291,20 +2395,24 @@ class NeonPageserver(PgProtocol): # The entries in the list are regular experessions. 
self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) - def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: + def timeline_dir( + self, + tenant_shard_id: Union[TenantId, TenantShardId], + timeline_id: Optional[TimelineId] = None, + ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: - return self.tenant_dir(tenant_id) / "timelines" - return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_shard_id) / "timelines" + return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id) def tenant_dir( self, - tenant_id: Optional[TenantId] = None, + tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" - if tenant_id is None: + if tenant_shard_id is None: return self.workdir / "tenants" - return self.workdir / "tenants" / str(tenant_id) + return self.workdir / "tenants" / str(tenant_shard_id) def start( self, @@ -2417,44 +2525,6 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def assert_log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Tuple[str, LogCursor]: - """Convenient for use inside wait_until()""" - - res = self.log_contains(pattern, offset=offset) - assert res is not None - return res - - def log_contains( - self, pattern: str, offset: None | LogCursor = None - ) -> Optional[Tuple[str, LogCursor]]: - """Check that the pageserver log contains a line that matches the given regex""" - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return None - - contains_re = re.compile(pattern) - - # XXX: Our rust logging machinery buffers the messages, so if you - # call this function 
immediately after it's been logged, there is - # no guarantee it is already present in the log file. This hasn't - # been a problem in practice, our python tests are not fast enough - # to hit that race condition. - skip_until_line_no = 0 if offset is None else offset._line_no - cur_line_no = 0 - with logfile.open("r") as f: - for line in f: - if cur_line_no < skip_until_line_no: - cur_line_no += 1 - continue - if contains_re.search(line): - # found it! - cur_line_no += 1 - return (line, LogCursor(cur_line_no)) - return None - def tenant_attach( self, tenant_id: TenantId, @@ -2489,8 +2559,10 @@ class NeonPageserver(PgProtocol): client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) - def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: - path = self.tenant_dir(tenant_id) / "config-v1" + def read_tenant_location_conf( + self, tenant_shard_id: Union[TenantId, TenantShardId] + ) -> dict[str, Any]: + path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") bytes = open(path, "r").read() try: @@ -3233,6 +3305,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -3255,6 +3328,7 @@ class Endpoint(PgProtocol): pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -3271,7 +3345,10 @@ class Endpoint(PgProtocol): return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Start the Postgres instance. 
@@ -3287,6 +3364,7 @@ class Endpoint(PgProtocol): safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) self.running = True @@ -3416,6 +3494,7 @@ class Endpoint(PgProtocol): config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. @@ -3431,7 +3510,12 @@ class Endpoint(PgProtocol): hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, - ).start(remote_ext_config=remote_ext_config, pageserver_id=pageserver_id) + allow_multiple=allow_multiple, + ).start( + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -3694,13 +3778,15 @@ class S3Scrubber: log.warning(f"Scrub environment: {env}") log.warning(f"Output at: {output_path}") - raise RuntimeError("Remote storage scrub failed") + raise RuntimeError(f"Scrubber failed while running {args}") assert stdout is not None return stdout def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + stdout = self.scrubber_cli( + ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 + ) try: return json.loads(stdout) @@ -3709,6 +3795,13 @@ class S3Scrubber: log.error(stdout) raise + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): + stdout = self.scrubber_cli( + ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], + timeout=30, + ) + log.info(f"tenant-snapshot output: {stdout}") + def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: """Compute the path to a working directory for an individual test.""" diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 
b899b0dac8..231ffd898e 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -293,7 +293,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): lazy: Optional[bool] = None, ): body = location_conf.copy() - body["tenant_id"] = str(tenant_id) params = {} if flush_ms is not None: diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 60591d8d46..83f9f26837 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -252,8 +252,11 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenants_path(self) -> str: + return f"{self.prefix_in_bucket}/tenants" + def tenant_path(self, tenant_id: TenantId) -> str: - return f"{self.prefix_in_bucket}/tenants/{tenant_id}" + return f"{self.tenants_path()}/{tenant_id}" def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" @@ -262,6 +265,9 @@ class S3Storage: r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) return json.loads(r["Body"].read().decode("utf-8")) + def mock_remote_tenant_path(self, tenant_id: TenantId): + assert self.real is False + RemoteStorage = Union[LocalFsStorage, S3Storage] diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index 80c9b9ce9a..b5458b5c26 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -156,7 +156,11 @@ class TenantShardId: raise ValueError(f"Invalid TenantShardId '{input}'") def __str__(self): - return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + if self.shard_count > 0: + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + else: + # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) + return str(self.tenant_id) def __repr__(self): return self.__str__() diff --git 
a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py new file mode 100644 index 0000000000..17dc96dabe --- /dev/null +++ b/test_runner/performance/test_storage_controller_scale.py @@ -0,0 +1,198 @@ +import concurrent.futures +import random +import time + +import pytest +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion +from fixtures.types import TenantId, TenantShardId, TimelineId + + +@pytest.mark.timeout(3600) # super long running test: should go down as we optimize +def test_storage_controller_many_tenants( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure +): + """ + Check that we cope well with a not-totally-trivial number of tenants. + + This is checking for: + - Obvious concurrency bugs from issuing many tenant creations/modifications + concurrently. + - Obvious scaling bugs like O(N^2) scaling that would be so slow that even + a basic test starts failing from slowness. + + This is _not_ a comprehensive scale test: just a basic sanity check that + we don't fall over for a thousand shards. + """ + + neon_env_builder.num_pageservers = 5 + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to + # guard against regressions in restart time. 
+ "max_unavailable": "300s" + } + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write + compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) + + env = neon_env_builder.init_start() + + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + + for ps in env.pageservers: + # This can happen because when we do a loop over all pageservers and mark them offline/active, + # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of + # bumping generation before other attachments are detached. + # + # We could clean this up by making reconcilers respect the .observed of their predecessor, if + # we spawn with a wait for the predecessor. + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Storage controller is allowed to drop pageserver requests when the cancellation token + # for a Reconciler fires. 
+ ps.allowed_errors.append(".*request was dropped before completing.*") + + # Total tenants + tenant_count = 4000 + + # Shards per tenant + shard_count = 2 + stripe_size = 1024 + + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + def check_memory(): + # Shards should be cheap_ in memory, as we will have very many of them + expect_memory_per_shard = 128 * 1024 + + rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") + assert rss is not None + log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") + assert rss < expect_memory_per_shard * shard_count * tenant_count + + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) + + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore + # permits, to ensure that we are exercising stressing that. + api_concurrency = 135 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + futs = [] + t1 = time.time() + for tenant_id in tenants: + f = executor.submit( + env.storage_controller.tenant_create, + tenant_id, + shard_count, + stripe_size, + placement_policy={"Attached": 1}, + ) + futs.append(f) + + # Wait for creations to finish + for f in futs: + f.result() + log.info( + f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" + ) + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + op_tenants = list(tenants)[0:run_ops] + + # Generate a mixture of operations and dispatch them all concurrently + futs = [] + for tenant_id in op_tenants: + op = rng.choice([0, 1, 2]) + if op == 0: + # A fan-out write operation to all shards in a tenant (timeline creation) + f = executor.submit( + virtual_ps_http.timeline_create, + PgVersion.NOT_SET, + tenant_id, + TimelineId.generate(), + ) + elif op == 1: + # A reconciler operation: migrate a shard. 
+ shard_number = rng.randint(0, shard_count - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + f = executor.submit( + env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + ) + elif op == 2: + # A passthrough read to shard zero + f = executor.submit(virtual_ps_http.tenant_status, tenant_id) + + futs.append(f) + + # Wait for mixed ops to finish + for f in futs: + f.result() + + # Consistency check is safe here: all the previous operations waited for reconcile before completing + env.storage_controller.consistency_check() + check_memory() + + # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time + # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if + # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling) + # + # We do not require that the system is quiescent already here, although at present in this point in the test + # that may be the case. + while True: + t1 = time.time() + reconcilers = env.storage_controller.reconcile_all() + if reconcilers == 0: + # Time how long a no-op background reconcile takes: this measures how long it takes to + # loop over all the shards looking for work to do. + runtime = time.time() - t1 + log.info(f"No-op call to reconcile_all took {runtime}s") + assert runtime < 1 + break + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + # See how long the controller takes to pass its readiness check. This should be fast because + # all the nodes are online: offline pageservers are the only thing that's allowed to delay + # startup. 
+ readiness_period = env.storage_controller.wait_until_ready() + assert readiness_period < 5 + + # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers + # to run, as it was in a stable state before restart. If it did, that's a bug. + env.storage_controller.consistency_check() + check_memory() + + # Restart pageservers: this exercises the /re-attach API + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, + # as they were not offline long enough to trigger any scheduling changes. + env.storage_controller.consistency_check() + check_memory() + + # Stop the storage controller before tearing down fixtures, because it otherwise might log + # errors trying to call our `ComputeReconfigure`. + env.storage_controller.stop() diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 909d25980b..59461cc095 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -190,6 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "trace_read_requests": True, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, + "switch_to_aux_file_v2": True, } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c808fa0f54..82a3a05c2b 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -1,10 +1,13 @@ import random import time +import psycopg2.errors +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +@pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() 
env.pageserver.allowed_errors.append(".*simulated connection error.*") @@ -20,12 +23,20 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): pg_conn = endpoint.connect() cur = pg_conn.cursor() + def execute_retry_on_timeout(query): + while True: + try: + cur.execute(query) + return + except psycopg2.errors.QueryCanceled: + log.info(f"Query '{query}' timed out - retrying") + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. - cur.execute("CREATE TABLE foo (t text)") - cur.execute( + execute_retry_on_timeout("CREATE TABLE foo (t text)") + execute_retry_on_timeout( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -34,7 +45,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + execute_retry_on_timeout( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' @@ -45,16 +56,16 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") + execute_retry_on_timeout("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) end_time = time.time() + 30 times_executed = 0 while time.time() < end_time: if random.random() < 0.5: - cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')") else: - cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") cur.fetchall() times_executed += 1 log.info(f"Workload executed 
{times_executed} times") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 5b69649007..9fe9f77fea 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,6 +1,7 @@ import random import threading import time +from concurrent.futures import ThreadPoolExecutor from typing import List import pytest @@ -405,6 +406,29 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 +def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + failpoint = "Timeline::find_gc_cutoffs-pausable" + + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None) + + wait_until_paused(env, failpoint) + + env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + + client.configure_failpoints((failpoint, "off")) + + completion.result() + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 804ad135ce..1279c1bf81 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -17,11 +17,16 @@ from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep # running. 
def test_local_corruption(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + reconstruct_function_name = "get_values_reconstruct_data" + else: + reconstruct_function_name = "get_value_reconstruct_data" + env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - ".*get_value_reconstruct_data for layer .*", + f".*{reconstruct_function_name} for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -84,7 +89,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..43a3323462 --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,192 @@ +import json +import os +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. 
+ "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. + neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + 
"pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_average = vectored_sum.value / vectored_count.value + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8 + + +# Stripe sizes in number of pages. +TINY_STRIPES = 16 +LARGE_STRIPES = 32768 + + +@pytest.mark.parametrize( + "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] +) +def test_sharding_compaction( + neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] +): + """ + Use small stripes, small layers, and small compaction thresholds to exercise how compaction + and image layer generation interacts with sharding. + + We are looking for bugs that might emerge from the way sharding uses sparse layer files that + only contain some of the keys in the key range covered by the layer, such as errors estimating + the size of layers that might result in too-small layer files. 
+ """ + + compaction_target_size = 128 * 1024 + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{compaction_target_size}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly: we want to exercise image layer creation in this test. + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": 0, + } + + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 10): + # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, + # these should result in image layers each time we write some data into a shard, and also shards + # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer, + # rather than asserting) + workload.churn_rows(64) + + # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes + # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job. 
+ shard_has_image_layers = [] + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + image_layer_sizes = {} + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_sizes[layer.layer_file_name] = layer.layer_file_size + + # Pageserver should assert rather than emit an empty layer file, but double check here + assert layer.layer_file_size is not None + assert layer.layer_file_size > 0 + + shard_has_image_layers.append(len(image_layer_sizes) > 1) + log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}") + + if stripe_size == TINY_STRIPES: + # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if + # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical + # data in a keyrange. + # + # We only do this check with tiny stripes, because large stripes may not give all shards enough + # data to have statistically significant image layers + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) # type: ignore + log.info(f"Shard {shard_id} average image layer size: {avg_size}") + assert avg_size > compaction_target_size / 2 + + if stripe_size == TINY_STRIPES: + # Expect writes were scattered across all pageservers: they should all have compacted some image layers + assert all(shard_has_image_layers) + else: + # With large stripes, it is expected that most of our writes went to one pageserver, so we just require + # that at least one of them has some image layers. 
+ assert any(shard_has_image_layers) + + # Assert that everything is still readable + workload.validate() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 208263a22a..e1ccb3e0c6 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,9 +192,6 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 -# The test is disabled until the next release deployment -@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -229,6 +226,12 @@ def test_forward_compatibility( ) try: + # Previous version neon_local and pageserver are not aware + # of the new config. + # TODO: remove these once the previous version of neon local supports them + neon_env_builder.pageserver_get_impl = None + neon_env_builder.pageserver_validate_vectored_get = None + neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 01ecc2b95f..30f8d81890 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,6 +19,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") + env.pageserver.allowed_errors.extend( + [ + # seems like pageserver stop triggers these + ".*initial size calculation failed.*Bad state (not active).*", + ] + ) endpoint = env.endpoints.create("test_crafted_wal_end") 
wal_craft = WalCraft(env) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index ac3315b86f..179cc273ec 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -3,7 +3,7 @@ import re import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup # Check for corrupted WAL messages which might otherwise go unnoticed if @@ -102,3 +102,80 @@ def test_2_replicas_start(neon_simple_env: NeonEnv): ) as secondary2: wait_replica_caughtup(primary, secondary1) wait_replica_caughtup(primary, secondary2) + + +# We had an issue that a standby server made GetPage requests with an +# old LSN, based on the last-written LSN cache, to avoid waits in the +# pageserver. However, requesting a page with a very old LSN, such +# that the GC horizon has already advanced past it, results in an +# error from the pageserver: +# "Bad request: tried to request a page version that was garbage collected" +# +# To avoid that, the compute<-> pageserver protocol was updated so +# that that the standby now sends two LSNs, the old last-written LSN +# and the current replay LSN. +# +# https://github.com/neondatabase/neon/issues/6211 +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder): + tenant_conf = { + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + # Protocol version 2 was introduced to fix the issue + # that this test exercises. With protocol version 1 it + # fails. 
+ config_lines=["neon.protocol_version=2"], + ) as secondary: + p_cur = primary.connect().cursor() + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)") + p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g") + + wait_replica_caughtup(primary, secondary) + + s_cur = secondary.connect().cursor() + + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 + + # Clear the cache in the standby, so that when we + # re-execute the query, it will make GetPage + # requests. This does not clear the last-written LSN cache + # so we still remember the LSNs of the pages. + s_cur.execute("SELECT clear_buffer_cache()") + + # Do other stuff on the primary, to advance the WAL + p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") + + # Run GC. The PITR interval is very small, so this advances the GC cutoff LSN + # very close to the primary's current insert LSN. + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Re-execute the query. The GetPage requests that this + # generates use old not_modified_since LSNs, older than + # the GC cutoff, but new request LSNs. (In protocol + # version 1 there was only one LSN, and this failed.) 
+ s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 38f2034c18..76c6581448 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -4,16 +4,21 @@ import threading import time from typing import List -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. + neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + + env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") endpoint = env.endpoints.create_start( diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 5813231aab..5c99ca6733 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,3 +1,4 @@ +import re import time from datetime import datetime, timedelta, timezone @@ -109,6 +110,8 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): + key_not_found_error = r".*could not find data for key.*" + env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") @@ -177,8 +180,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): raise RuntimeError("there should have been 
an 'could not find data for key' error") except PageserverApiException as error: assert error.status_code == 500 - assert str(error).startswith("could not find data for key") - env.pageserver.allowed_errors.append(".*could not find data for key.*") + assert re.match(key_not_found_error, str(error)) + env.pageserver.allowed_errors.append(key_not_found_error) # Probe a bunch of timestamps in the valid range step_size = 100 diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index ba0d53704b..6c2556f6a2 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -333,6 +333,17 @@ def test_download_remote_layers_api( } ) + # This test triggers layer download failures on demand. It is possible to modify the failpoint + # during a `Timeline::get_vectored` right between the vectored read and it's validation read. + # This means that one of the reads can fail while the other one succeeds and vice versa. + # TODO(vlad): Remove this block once the vectored read path validation goes away. 
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*",
+            ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*",
+        ]
+    )
+
     endpoint = env.endpoints.create_start("main")
 
     client = env.pageserver.http_client()
diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py
new file mode 100644
index 0000000000..0b36b32552
--- /dev/null
+++ b/test_runner/regress/test_ondemand_slru_download.py
@@ -0,0 +1,131 @@
+from typing import Optional
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar
+
+
+#
+# Test on-demand download of the pg_xact SLRUs
+#
+@pytest.mark.parametrize("shard_count", [None, 4])
+def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+
+    tenant_conf = {
+        "lazy_slru_download": "true",
+        # set PITR interval to be small, so we can do GC
+        "pitr_interval": "0 s",
+    }
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count
+    )
+
+    timeline_id = env.initial_timeline
+    tenant_id = env.initial_tenant
+    endpoint = env.endpoints.create_start("main")
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    # Create a test table
+    cur.execute("CREATE TABLE clogtest (id integer)")
+    cur.execute("INSERT INTO clogtest VALUES (1)")
+
+    # Consume a lot of XIDs, to create more pg_xact segments
+    for _ in range(1000):
+        cur.execute("select test_consume_xids(10000);")
+    cur.execute("INSERT INTO clogtest VALUES (2)")
+    for _ in range(1000):
+        cur.execute("select 
test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Restart postgres. After restart, the new instance will download the + # pg_xact segments lazily. + endpoint.stop() + endpoint.start() + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Consume more WAL, so that the pageserver can compact and GC older data, + # including the LSN that we started the new endpoint at, + cur.execute("CREATE TABLE anothertable (i int, t text)") + cur.execute( + "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g" + ) + + # Run GC + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Test that this can still on-demand download the old pg_xact segments + cur.execute("select xmin, xmax, * from clogtest") + tup = cur.fetchall() + log.info(f"tuples = {tup}") + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + 
cur.execute("select test_consume_xids(10000);") + + # Open a new connection and insert another row, but leave + # the transaction open + pg_conn2 = endpoint.connect() + cur2 = pg_conn2.cursor() + cur2.execute("BEGIN") + cur2.execute("INSERT INTO clogtest VALUES (2)") + + # Another insert on the first connection, which is committed. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Start standby at this point in time + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) + + # Commit transaction 2, after the standby was launched. + cur2.execute("COMMIT") + + # The replica should not see transaction 2 as committed. + conn_replica = endpoint_at_lsn.connect() + cur_replica = conn_replica.cursor() + cur_replica.execute("SELECT * FROM clogtest") + assert cur_replica.fetchall() == [(1,), (3,)] diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py new file mode 100644 index 0000000000..c04348b488 --- /dev/null +++ b/test_runner/regress/test_pageserver_config.py @@ -0,0 +1,35 @@ +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + last_flush_lsn_upload, +) + + +@pytest.mark.parametrize("kind", ["sync", "async"]) +def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str): + neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'" + # ensure it starts + env = neon_env_builder.init_start() + # ensure the metric is set + ps_http = env.pageserver.http_client() + metrics = ps_http.get_metrics() + samples = metrics.query_all("pageserver_wal_redo_process_kind") + assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)] + # ensure default tenant's config kind matches + # => write some data to force-spawn walredo + ep = env.endpoints.create_start("main") 
+ with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("create table foo(bar text)") + cur.execute("insert into foo select from generate_series(1, 100)") + last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline) + ep.stop() + ep.start() + with ep.connect() as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + [(count,)] = cur.fetchall() + assert count == 100 + + status = ps_http.tenant_status(env.initial_tenant) + assert status["walredo"]["process"]["kind"] == kind diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c7e1e88468..c5dc0f2919 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,6 +1,7 @@ import asyncio import os -from typing import Tuple +import time +from typing import Optional, Tuple import psutil import pytest @@ -20,20 +21,30 @@ ENTRIES_PER_TIMELINE = 10_000 CHECKPOINT_TIMEOUT_SECONDS = 60 -async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + with env.endpoints.create_start("main", tenant_id=tenant) as ep: conn = await ep.connect_async() try: await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") await conn.execute( - f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i" + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" ) finally: await conn.close(timeout=10) last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return tenant, timeline, last_flush_lsn + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: 
int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn async def workload( @@ -89,7 +100,9 @@ def assert_dirty_bytes(env, v): def assert_dirty_bytes_nonzero(env): - assert get_dirty_bytes(env) > 0 + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes @pytest.mark.parametrize("immediate_shutdown", [True, False]) @@ -182,6 +195,31 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): log.info("Waiting for background checkpoints...") wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. + + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + @pytest.mark.skipif( # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 345abdc072..8f194e5dda 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,6 +1,7 @@ import json import 
os import random +import time from pathlib import Path from typing import Any, Dict, Optional @@ -582,6 +583,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + freshen_interval_secs = 60 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. + conf={"heatmap_period": "1s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. 
+ time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + for timeline_id in timelines: + log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}") + # One or more layers should be present for all timelines + assert list_layers(ps_secondary, tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion + time.sleep(freshen_interval_secs * 1.5) + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # This one was not deleted + assert list_layers(ps_secondary, tenant_id, timelines[0]) + + # This one was deleted + assert not list_layers(ps_secondary, tenant_id, timelines[1]) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. 
+ total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / freshen_interval_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + @pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index e4219ec7a6..2b1b7fff34 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -18,6 +18,7 @@ from fixtures.remote_storage import s3_storage def test_pg_regress( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, + build_type: str, pg_bin, capsys, base_dir: Path, @@ -30,6 +31,11 @@ def test_pg_regress( """ if shard_count is not None: neon_env_builder.num_pageservers = shard_count + + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. 
+ neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py new file mode 100644 index 0000000000..8e80efd9ba --- /dev/null +++ b/test_runner/regress/test_pg_waldump.py @@ -0,0 +1,61 @@ +import os +import shutil + +from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import subprocess_capture + + +def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture( + test_output_dir, [pg_waldump_path, "--ignore", segment_path] + ) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout + + +# Simple test to check that pg_waldump works with neon WAL files +def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): + env = neon_simple_env + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") + endpoint = env.endpoints.create_start("test_pg_waldump") + + cur = endpoint.connect().cursor() + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + ROLLBACK; + """ + ) + + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + COMMIT; + """ + ) + + # stop the endpoint to make sure that WAL files are flushed and won't change + endpoint.stop() + + assert endpoint.pgdata_dir + wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") + 
pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + # check segment on compute + check_wal_segment(pg_waldump_path, wal_path, test_output_dir) + + # Check file on safekeepers as well. pg_waldump is strict about file naming, so remove .partial suffix. + sk = env.safekeepers[0] + sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id) + non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001") + shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path) + check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir) diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. 
- time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. - f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. 
- f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 868b80a561..2437c8f806 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,7 +17,14 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - endpoint = env.endpoints.create_start("test_read_validation") + endpoint = env.endpoints.create_start( + "test_read_validation", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -64,7 +71,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading stale page version") c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" @@ -77,7 +84,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Cache is clear, reading latest page version without cache") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -92,7 +99,7 @@ def 
test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -102,7 +109,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -114,7 +121,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -133,7 +140,14 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start("test_read_validation_neg") + endpoint = env.endpoints.create_start( + "test_read_validation_neg", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. 
+ config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: @@ -143,7 +157,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page of a missing relation") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except UndefinedTable as e: @@ -155,7 +169,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page at lsn 0") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except IoError as e: @@ -164,22 +178,22 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("Pass NULL as an input") expected = (None, None, None) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" # This check is currently failing, reading beyond EOF is returning a 0-page log.info("Read beyond EOF") c.execute( - "select lsn, lower, upper from 
page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))" ) diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py new file mode 100644 index 0000000000..018c1637d0 --- /dev/null +++ b/test_runner/regress/test_s3_scrubber.py @@ -0,0 +1,111 @@ +import os +import shutil +from typing import Optional + +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + S3Scrubber, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.types import TenantShardId +from fixtures.workload import Workload + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + """ + Test the `tenant-snapshot` subcommand, which grabs data from remote storage + + This is only a support/debug tool, but worth testing to ensure the tool does not regress. + """ + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + branch = "main" + + # Do some work + workload = Workload(env, tenant_id, timeline_id, branch) + workload.init() + + # Multiple write/flush passes to generate multiple layers + for _n in range(0, 3): + workload.write_rows(128) + + # Do some more work after a restart, so that we have multiple generations + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _n in range(0, 3): + workload.write_rows(128) + + # If we're doing multiple shards, split: this is important to exercise + # the scrubber's ability to understand the references from child shards to parent shard's layers + if shard_count is not None: + tenant_shard_ids = env.storage_controller.tenant_shard_split( + tenant_id, 
shard_count=shard_count + ) + + # Write after shard split: this will result in shards containing a mixture of owned + # and parent layers in their index. + workload.write_rows(128) + else: + tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + + output_path = neon_env_builder.test_output_dir / "snapshot" + os.makedirs(output_path) + + scrubber = S3Scrubber(neon_env_builder) + scrubber.tenant_snapshot(tenant_id, output_path) + + assert len(os.listdir(output_path)) > 0 + + workload.stop() + + # Stop pageservers + for pageserver in env.pageservers: + pageserver.stop() + + # Drop all shards' local storage + for tenant_shard_id in tenant_shard_ids: + pageserver = env.get_tenant_pageserver(tenant_shard_id) + shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id)) + + # Replace remote storage contents with the snapshot we downloaded + assert isinstance(env.pageserver_remote_storage, S3Storage) + + remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id) + + # Delete current remote storage contents + bucket = env.pageserver_remote_storage.bucket_name + remote_client = env.pageserver_remote_storage.client + deleted = 0 + for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[ + "Contents" + ]: + key = object["Key"] + remote_client.delete_object(Key=key, Bucket=bucket) + deleted += 1 + assert deleted > 0 + + # Upload from snapshot + for root, _dirs, files in os.walk(output_path): + for file in files: + full_local_path = os.path.join(root, file) + full_remote_path = ( + env.pageserver_remote_storage.tenants_path() + + "/" + + full_local_path.removeprefix(f"{output_path}/") + ) + remote_client.upload_file(full_local_path, bucket, full_remote_path) + + for pageserver in env.pageservers: + pageserver.start() + + # Check we can read everything + workload.validate() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfaab9125f..258377f8a2 100644 --- 
a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -287,6 +287,11 @@ def test_sharding_split_smoke( == shard_count ) + # Make secondary downloads slow: this exercises the storage controller logic for not migrating an attachment + # during post-split optimization until the secondary is ready + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -300,7 +305,7 @@ def test_sharding_split_smoke( # Enough background reconciliations should result in the shards being properly distributed. # Run this before the workload, because its LSN-waiting code presumes stable locations. - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=60) workload.validate() @@ -342,6 +347,10 @@ def test_sharding_split_smoke( assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 assert errored_reconciles is not None and int(errored_reconciles) == 0 + # We should see that the migration of shards after the split waited for secondaries to warm up + # before happening + assert env.storage_controller.log_contains(".*Skipping.*because secondary isn't ready.*") + env.storage_controller.consistency_check() def get_node_shard_counts(env: NeonEnv, tenant_ids): @@ -928,6 +937,8 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline + ".*Reconcile error.*Cancelled.*", # While parent shard's client is stopped during split, flush loop updating LSNs 
will emit this warning ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] @@ -1069,6 +1080,17 @@ def test_sharding_split_failures( finish_split() assert_split_done() + if isinstance(failure, StorageControllerFailpoint) and "post-complete" in failure.failpoint: + # On a post-complete failure, the controller will recover the post-split state + # after restart, but it will have missed the optimization part of the split function + # where secondary downloads are kicked off. This means that reconcile_until_idle + # will take a very long time if we wait for all optimizations to complete, because + # those optimizations will wait for secondary downloads. + # + # Avoid that by configuring the tenant into Essential scheduling mode, so that it will + # skip optimizations when we're exercising this particular failpoint. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + # Having completed the split, pump the background reconciles to ensure that # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) @@ -1201,3 +1223,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) diff = max_lsn - min_lsn assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = 
[f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 840f354142..fdcb4cf9a4 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,4 +1,5 @@ import json +import threading import time from collections import defaultdict from datetime import datetime, timezone @@ -26,6 +27,7 @@ from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -228,6 +230,10 @@ def test_storage_controller_passthrough( } assert status["state"]["slug"] == "Active" + (synthetic_size, size_inputs) = client.tenant_size_and_modelinputs(env.initial_tenant) + 
assert synthetic_size > 0 + assert "segments" in size_inputs + env.storage_controller.consistency_check() @@ -273,7 +279,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. """ - neon_env_builder.num_pageservers = 2 + # One pageserver to simulate legacy environment, two to be managed by storage controller + neon_env_builder.num_pageservers = 3 # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() @@ -288,10 +295,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) origin_ps = env.pageservers[0] - # This is the pageserver managed by the sharding service, where the tenant + # These are the pageservers managed by the sharding service, where the tenant # will be attached after onboarding env.pageservers[1].start() - dest_ps = env.pageservers[1] + env.pageservers[2].start() virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: @@ -330,6 +337,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) virtual_ps_http.tenant_secondary_download(tenant_id) + warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] # Call into storage controller to onboard the tenant generation += 1 @@ -344,6 +354,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0] + dest_ps_id = describe["node_attached"] + dest_ps = env.get_pageserver(dest_ps_id) + if warm_up: + # The storage controller should have attached the tenant to the same placce + # it had a secondary location, otherwise there was no point warming it up + assert dest_ps_id == warm_up_ps + + # It should have been given a new secondary location as well + assert len(describe["node_secondary"]) == 1 + 
assert describe["node_secondary"][0] != warm_up_ps + # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( tenant_id, @@ -415,6 +437,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] ) dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + + # Storage controller auto-sets heatmap period, ignore it for the comparison + del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"] assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf env.storage_controller.consistency_check() @@ -1237,3 +1262,132 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Quiesce any background reconciliation before doing consistency check env.storage_controller.reconcile_until_idle(timeout_secs=10) env.storage_controller.consistency_check() + + +def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): + """ + Check that when lock on resource (tenants, nodes) is held for too long it is + traced in logs. 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + env.storage_controller.allowed_errors.extend( + [ + ".*Lock on.*", + ".*Scheduling is disabled by policy.*", + f".*Operation TimelineCreate on key {tenant_id} has waited.*", + ] + ) + + # Apply failpoint + env.storage_controller.configure_failpoints( + ("tenant-update-policy-exclusive-lock", "return(31000)") + ) + + # This will hold the exclusive for enough time to cause an warning + def update_tenent_policy(): + env.storage_controller.tenant_policy_update( + tenant_id=tenant_id, + body={ + "scheduling": "Stop", + }, + ) + + thread_update_tenant_policy = threading.Thread(target=update_tenent_policy) + thread_update_tenant_policy.start() + + # Make sure the update policy thread has started + time.sleep(1) + # This will not be able to access and will log a warning + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + thread_update_tenant_policy.join(timeout=10) + + env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") + env.storage_controller.assert_log_contains( + f"Operation TimelineCreate on key {tenant_id} has waited" + ) + + +@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): + """ + Tenant import is a support/debug tool for recovering a tenant from remote storage + if we don't have any metadata for it in the storage controller. + """ + + # This test is parametrized on remote storage because it exercises the relatively rare + # code path of listing with a prefix that is not a directory name: this helps us notice + # quickly if local_fs or s3_bucket implementations diverge. 
+ neon_env_builder.enable_pageserver_remote_storage(remote_storage) + + # Use multiple pageservers because some test helpers assume single sharded tenants + # if there is only one pageserver. + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + tenant_id = env.initial_tenant + + # Create a second timeline to ensure that import finds both + timeline_a = env.initial_timeline + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + workload_a = Workload(env, tenant_id, timeline_a, branch_name="main") + workload_a.init() + + workload_b = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload_b.init() + + # Write some data + workload_a.write_rows(72) + expect_rows_a = workload_a.expect_rows + workload_a.stop() + del workload_a + + # Bump generation to make sure generation recovery works properly + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Write some data in the higher generation into the other branch + workload_b.write_rows(107) + expect_rows_b = workload_b.expect_rows + workload_b.stop() + del workload_b + + # Detach from pageservers + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": "Detached", + }, + ) + env.storage_controller.reconcile_until_idle(timeout_secs=10) + + # Force-drop it from the storage controller + env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Now import it again + env.neon_cli.import_tenant(tenant_id) + + # Check we found the shards + describe = env.storage_controller.tenant_describe(tenant_id) + literal_shard_count = 1 if shard_count is None else shard_count + assert len(describe["shards"]) == literal_shard_count + + # Check the data is still there: this implicitly proves that we recovered generation numbers + # properly, for the timeline which was 
written to after a generation bump. + for timeline, branch, expect_rows in [ + (timeline_a, "main", expect_rows_a), + (timeline_b, "branch_1", expect_rows_b), + ]: + workload = Workload(env, tenant_id, timeline, branch_name=branch) + workload.expect_rows = expect_rows + workload.validate() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index a164c7f60a..c115c0375b 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -469,7 +469,8 @@ def test_tenant_delete_concurrent( ): """ Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should succeed. + exactly one should execute: the rest should give 202 responses but not start + another deletion. This is a reproducer for https://github.com/neondatabase/neon/issues/5936 """ @@ -484,14 +485,10 @@ def test_tenant_delete_concurrent( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken" - env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - # Errors logged from our 4xx requests - f".*{CONFLICT_MESSAGE}.*", ] ) @@ -507,7 +504,7 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] def hit_run_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") @@ -518,11 +515,14 @@ def test_tenant_delete_concurrent( # Wait until the first request completes its work and is blocked on removing # the TenantSlot from tenant manager. 
- wait_until(100, 0.1, hit_remove_failpoint) + log_cursor = wait_until(100, 0.1, hit_remove_failpoint) + assert log_cursor is not None - # Start another request: this should fail when it sees a tenant in Stopping state - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - ps_http.tenant_delete(tenant_id) + # Start another request: this should succeed without actually entering the deletion code + ps_http.tenant_delete(tenant_id) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Start another background request, which will pause after acquiring a TenantSlotGuard # but before completing. @@ -539,8 +539,10 @@ def test_tenant_delete_concurrent( # Permit the duplicate background request to run to completion and fail. ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - background_4xx_req.result(timeout=10) + background_4xx_req.result(timeout=10) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Physical deletion should have happened assert_prefix_empty( diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d3f24cb06e..0ba0108651 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -132,7 +132,7 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") + assert env.pageserver.log_contains(".*list identifiers.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9def3ad1c2..68d9d9a660 100644 --- 
a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -16,7 +16,6 @@ from fixtures.pageserver.utils import ( wait_for_upload, wait_tenant_status_404, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, @@ -24,7 +23,6 @@ from fixtures.remote_storage import ( from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - subprocess_capture, wait_until, ) @@ -184,20 +182,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. "minor", - # A major migration involves exporting a postgres datadir - # basebackup and importing it into the new pageserver. - # This kind of migration can tolerate breaking changes - # to storage format - "major", + # In the unlikely and unfortunate event that we have to break + # the storage format, extend this test with the param below. 
+ # "major", ], ) @pytest.mark.parametrize("with_load", ["with_load", "without_load"]) def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, - base_dir: Path, method: str, with_load: str, ): @@ -299,40 +291,7 @@ def test_tenant_relocation( current_lsn=current_lsn_second, ) - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(origin_http.port), - "--from-pg-port", - str(origin_ps.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(destination_http.port), - "--to-pg-port", - str(destination_ps.service_port.pg), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - "--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - - destination_ps.allowed_errors.append( - ".*ignored .* unexpected bytes after the tar archive.*" - ) - elif method == "minor": + if method == "minor": # call to attach timeline to new pageserver destination_ps.tenant_attach(tenant_id) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 4c8fd4b0e5..53da548524 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,4 +1,5 @@ import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple @@ -11,13 +12,15 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( + 
tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): @@ -292,33 +295,12 @@ def test_single_branch_get_tenant_size_grows( Operate on single branch reading the tenants size after each transaction. """ - # Disable automatic gc and compaction. - # The pitr_interval here is quite problematic, so we cannot really use it. - # it'd have to be calibrated per test executing env. - - # there was a bug which was hidden if the create table and first batch of - # inserts is larger than gc_horizon. for example 0x20000 here hid the fact - # that there next_gc_cutoff could be smaller than initdb_lsn, which will - # obviously lead to issues when calculating the size. - gc_horizon = 0x3BA00 - - # it's a bit of a hack, but different versions of postgres have different - # amount of WAL generated for the same amount of data. so we need to - # adjust the gc_horizon accordingly. 
- if pg_version == PgVersion.V14: - gc_horizon = 0x4A000 - elif pg_version == PgVersion.V15: - gc_horizon = 0x3BA00 - elif pg_version == PgVersion.V16: - gc_horizon = 210000 - else: - raise NotImplementedError(pg_version) - + # Disable automatic compaction and GC, and set a long PITR interval: we will expect + # size to always increase with writes as all writes remain within the PITR tenant_config = { "compaction_period": "0s", "gc_period": "0s", - "pitr_interval": "0s", - "gc_horizon": gc_horizon, + "pitr_interval": "3600s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) @@ -332,18 +314,6 @@ def test_single_branch_get_tenant_size_grows( size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change( - current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int - ): - if current_lsn - initdb_lsn >= gc_horizon: - assert ( - size >= prev_size - ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - else: - assert ( - size > prev_size - ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - def get_current_consistent_size( env: NeonEnv, endpoint: Endpoint, @@ -412,14 +382,6 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert phase, but before the update phase ends. 
- assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -439,8 +401,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("UPDATE", current_lsn, size)) @@ -457,8 +418,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("DELETE", current_lsn, size)) @@ -469,20 +429,20 @@ def test_single_branch_get_tenant_size_grows( with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") - # Without setting a PITR interval, dropping the table doesn't reclaim any space - # from the user's point of view, because the DROP transaction is too small - # to fall out of gc_horizon. + # Dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is still + # within pitr_interval. 
(current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) - prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size >= prev_size + prev_size = size - # Set a tiny PITR interval to allow the DROP to impact the synthetic size + # Set a zero PITR interval to allow the DROP to impact the synthetic size # Because synthetic size calculation uses pitr interval when available, # when our tenant is configured with a tiny pitr interval, dropping a table should # cause synthetic size to go down immediately - tenant_config["pitr_interval"] = "1ms" + tenant_config["pitr_interval"] = "0s" env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) (current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id @@ -494,10 +454,6 @@ def test_single_branch_get_tenant_size_grows( # defined by gc_horizon. collected_responses.append(("DROP", current_lsn, size)) - # Should have gone past gc_horizon, otherwise gc_horizon is too large - bytes_written = current_lsn - initdb_lsn - assert bytes_written > gc_horizon - # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. @@ -663,6 +619,68 @@ def test_get_tenant_size_with_multiple_branches( size_debug_file.write(size_debug) +def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): + """ + Makes sure synthetic size can still be calculated even if one of the + timelines is deleted or the tenant is deleted. 
+ """ + + env = neon_env_builder.init_start() + failpoint = "Timeline::find_gc_cutoffs-pausable" + client = env.pageserver.http_client() + + orig_size = client.tenant_size(env.initial_tenant) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + _, last_offset = wait_until( + 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + timeline_delete_wait_completed(client, env.initial_tenant, branch_id) + + client.configure_failpoints((failpoint, "off")) + size = completion.result() + + assert_size_approx_equal(orig_size, size) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + wait_until( + 10, + 1.0, + lambda: env.pageserver.assert_log_contains( + f"at failpoint {failpoint}", offset=last_offset + ), + ) + + tenant_delete_wait_completed(client, env.initial_tenant, 10) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="Failed to refresh gc_info before gathering inputs" + ): + completion.result() + + # this happens on both cases + env.pageserver.allowed_errors.append( + ".*ignoring failure to find gc cutoffs: timeline shutting down.*" + ) + # this happens only in the case of deletion (http response logging) + env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*") + + # Helper for tests that compare timeline_inputs # We don't want to compare the exact values, because they can be unstable # and cause flaky tests. 
So replace the values with useful invariants. diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index eff103ca09..06f2a8befd 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -173,7 +173,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # which changes the LSN on the page. cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() - cur.execute("select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn() )") + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() assert vm_page_at_pageserver == vm_page_in_cache diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ac1a747df3..967d133e18 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1828,7 +1828,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + timeline_id = env.neon_cli.create_branch("test_idle_reconnections") def collect_stats() -> Dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -1859,7 +1859,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") + endpoint = env.endpoints.create_start("test_idle_reconnections") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() @@ -2007,3 +2007,47 @@ def test_patch_control_file(neon_env_builder: NeonEnvBuilder): ) log.info(f"dump_control_file response: {res}") assert 
res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" + + +# Test disables periodic pushes from safekeeper to the broker and checks that +# pageserver can still discover safekeepers with discovery requests. +def test_broker_discovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker_discovery") + + endpoint = env.endpoints.create_start( + "test_broker_discovery", + config_lines=["shared_buffers=1MB"], + ) + endpoint.safe_psql("create table t(i int, payload text)") + # Install extension containing function needed to clear buffer + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") + + def do_something(): + time.sleep(1) + # generate some data to commit WAL on safekeepers + endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") + # clear the buffers + endpoint.safe_psql("select clear_buffer_cache()") + # read data to fetch pages from pageserver + endpoint.safe_psql("select sum(i) from t") + + do_something() + do_something() + + for sk in env.safekeepers: + # Disable periodic broker push, so pageserver won't be able to discover + # safekeepers without sending a discovery request + sk.stop().start(extra_opts=["--disable-periodic-broker-push"]) + + do_something() + do_something() + + # restart pageserver and check how everything works + env.pageserver.stop().start() + + do_something() + do_something() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 5902eb3217..dce5616ac6 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -254,7 +254,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( 
+ env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False +): endpoint = Endpoint( env, tenant_id=env.initial_tenant, @@ -268,14 +270,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): # embed current time in endpoint ID endpoint_id = pgdir_name or f"ep-{time.time()}" return endpoint.create_start( - branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] + branch_name=branch, + endpoint_id=endpoint_id, + config_lines=["log_statement=all"], + allow_multiple=allow_multiple, ) async def exec_compute_query( - env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None + env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None, + allow_multiple: bool = False, ): - with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: + with endpoint_create_start( + env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple + ) as endpoint: before_conn = time.time() conn = await endpoint.connect_async() res = await conn.fetch(query) @@ -347,6 +358,7 @@ class BackgroundCompute(object): self.branch, f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", pgdir_name=f"bgcompute{self.index}_key{verify_key}", + allow_multiple=True, ) log.info(f"result: {res}") if len(res) != 1: diff --git a/trace/src/main.rs b/trace/src/main.rs index 4605c124e9..049f922b6f 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -7,7 +7,9 @@ use std::{ io::BufReader, }; -use pageserver_api::models::{PagestreamFeMessage, PagestreamGetPageRequest}; +use pageserver_api::models::{ + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, +}; use utils::id::{ConnectionId, TenantId, TimelineId}; use clap::{Parser, Subcommand}; @@ -56,7 +58,7 @@ fn analyze_trace(mut reader: R) { let mut prev: Option = None; // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + 
while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} @@ -89,7 +91,7 @@ fn analyze_trace(mut reader: R) { } fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { println!("{msg:?}"); } } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a7b4c66156..d6f7e2c604 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a7b4c66156bce00afa60e5592d4284ba9e40b4cf +Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 64b8c7bccc..f0d6b0ef75 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed +Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3946b2e2ea..8ef3c33aa0 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3946b2e2ea71d07af092099cb5bcae76a69b90d6 +Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46 diff --git a/vendor/revisions.json b/vendor/revisions.json index 75dc095168..a353fde8fd 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "3946b2e2ea71d07af092099cb5bcae76a69b90d6", - "postgres-v15": "64b8c7bccc6b77e04795e2d4cf6ad82dc8d987ed", - "postgres-v14": "a7b4c66156bce00afa60e5592d4284ba9e40b4cf" + "postgres-v16": "8ef3c33aa01631e17cb24a122776349fcc777b46", + "postgres-v15": "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a", + "postgres-v14": "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index c760744491..41ca16f16b 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -5,6 +5,12 @@ commands: user: root 
sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' - name: pgbouncer user: postgres sysvInitAction: respawn @@ -16,10 +22,19 @@ commands: - name: sql-exporter user: nobody sysvInitAction: respawn - shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: + - filename: compute_ctl-resize-swap + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap - filename: pgbouncer.ini content: | [databases] @@ -88,6 +103,41 @@ files: # Glob patterns are supported (see for syntax). collector_files: - "neon_collector.yml" + - filename: sql_exporter_autoscaling.yml + content: | + # Configuration for sql_exporter for autoscaling-agent + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: 0s + # Maximum number of open connections to any one target. 
Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + + # Collectors (referenced by name) to execute on the target. + # Glob patterns are supported (see for syntax). + collectors: [neon_collector_autoscaling] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). 
+ collector_files: + - "neon_collector_autoscaling.yml" - filename: neon_collector.yml content: | collector_name: neon_collector @@ -194,6 +244,57 @@ files: values: [approximate_working_set_size] query: | select neon.approximate_working_set_size(false) as approximate_working_set_size; + - filename: neon_collector_autoscaling.yml + content: | + collector_name: neon_collector_autoscaling + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'LFC chunks used (chunk = 1MB)' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; build: | # Build cgroup-tools @@ -263,17 +364,32 @@ merge: | && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. 
+ # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. + RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml COPY neon_collector.yml /etc/neon_collector.yml + COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml + COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0666 /etc/pgbouncer.ini \ && chmod 0644 /etc/cgconfig.conf \ && chmod 0644 /etc/sql_exporter.yml \ - && chmod 0644 /etc/neon_collector.yml + && chmod 0644 /etc/neon_collector.yml \ + && chmod 0644 /etc/sql_exporter_autoscaling.yml \ + && chmod 0644 /etc/neon_collector_autoscaling.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 7b8228a082..b2da33e44a 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -37,8 +37,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } -hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -58,14 +57,16 @@ rand = { version = "0.8", features = 
["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } +reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "default-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10", features = ["asm"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } +sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } @@ -76,7 +77,6 @@ tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive"] } @@ -91,7 +91,7 @@ cc = { version = 
"1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] }