Compare commits


1 Commit

Author: Heikki Linnakangas
SHA1: e19c84cba9
Message: Track not_modified_since, client-support for protocol V2
Date: 2024-04-14 17:50:34 +03:00
195 changed files with 5599 additions and 11654 deletions

View File

@@ -477,8 +477,6 @@ jobs:
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
# Temporarily disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
@@ -558,9 +556,6 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: false
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -740,7 +735,7 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v3
with:
@@ -797,7 +792,7 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/setup-buildx-action@v3
with:
# Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it at an additional level of parallelism blows up the Runner.
@@ -870,7 +865,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.28.1
VM_BUILDER_VERSION: v0.23.2
steps:
- name: Checkout
@@ -1138,6 +1133,8 @@ jobs:
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \

View File

@@ -28,9 +28,7 @@ jobs:
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
env:
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
COMMIT_SHA: ${{ github.sha }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LAST_BUILD_TOOLS_SHA=$(

Cargo.lock (generated), 125 changed lines
View File

@@ -599,7 +599,7 @@ dependencies = [
"once_cell",
"pin-project-lite",
"pin-utils",
"rustls 0.21.11",
"rustls 0.21.9",
"tokio",
"tracing",
]
@@ -722,9 +722,9 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7"
checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
dependencies = [
"async-trait",
"base64 0.21.1",
@@ -752,9 +752,9 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.19.0"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f"
checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
dependencies = [
"async-lock",
"async-trait",
@@ -772,9 +772,9 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266"
checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
dependencies = [
"RustyXML",
"async-lock",
@@ -791,9 +791,9 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94"
checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
dependencies = [
"RustyXML",
"azure_core",
@@ -812,9 +812,9 @@ dependencies = [
[[package]]
name = "azure_svc_blobstorage"
version = "0.19.0"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b"
checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
dependencies = [
"azure_core",
"bytes",
@@ -1319,7 +1319,6 @@ dependencies = [
"git-version",
"hex",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"nix 0.27.1",
"once_cell",
@@ -2520,7 +2519,7 @@ dependencies = [
"http 0.2.9",
"hyper 0.14.26",
"log",
"rustls 0.21.11",
"rustls 0.21.9",
"rustls-native-certs 0.6.2",
"tokio",
"tokio-rustls 0.24.0",
@@ -2764,9 +2763,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.69"
version = "0.3.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790"
dependencies = [
"wasm-bindgen",
]
@@ -3185,16 +3184,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3531,12 +3520,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -3675,7 +3658,6 @@ dependencies = [
"tokio-util",
"toml_edit",
"tracing",
"twox-hash",
"url",
"utils",
"walkdir",
@@ -4077,7 +4059,7 @@ dependencies = [
"futures",
"once_cell",
"pq_proto",
"rustls 0.22.4",
"rustls 0.22.2",
"rustls-pemfile 2.1.1",
"serde",
"thiserror",
@@ -4368,7 +4350,7 @@ dependencies = [
"routerify",
"rstest",
"rustc-hash",
"rustls 0.22.4",
"rustls 0.22.2",
"rustls-pemfile 2.1.1",
"scopeguard",
"serde",
@@ -4560,7 +4542,7 @@ dependencies = [
"itoa",
"percent-encoding",
"pin-project-lite",
"rustls 0.22.4",
"rustls 0.22.2",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
@@ -4714,7 +4696,7 @@ dependencies = [
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls 0.21.11",
"rustls 0.21.9",
"rustls-pemfile 1.0.2",
"serde",
"serde_json",
@@ -4974,9 +4956,9 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.21.11"
version = "0.21.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9"
dependencies = [
"log",
"ring 0.17.6",
@@ -4986,9 +4968,9 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.22.4"
version = "0.22.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
dependencies = [
"log",
"ring 0.17.6",
@@ -5102,7 +5084,6 @@ dependencies = [
"aws-smithy-async",
"bincode",
"bytes",
"camino",
"chrono",
"clap",
"crc32c",
@@ -5112,11 +5093,8 @@ dependencies = [
"hex",
"histogram",
"itertools",
"native-tls",
"pageserver",
"pageserver_api",
"postgres-native-tls",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest",
@@ -5125,10 +5103,8 @@ dependencies = [
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-appender",
"tracing-subscriber",
@@ -5306,7 +5282,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
dependencies = [
"httpdate",
"reqwest",
"rustls 0.21.11",
"rustls 0.21.9",
"sentry-backtrace",
"sentry-contexts",
"sentry-core",
@@ -5854,7 +5830,8 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "svg_fmt"
version = "0.4.2"
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499"
[[package]]
name = "syn"
@@ -6216,7 +6193,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
dependencies = [
"futures",
"ring 0.17.6",
"rustls 0.22.4",
"rustls 0.22.2",
"tokio",
"tokio-postgres",
"tokio-rustls 0.25.0",
@@ -6229,7 +6206,7 @@ version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
dependencies = [
"rustls 0.21.11",
"rustls 0.21.9",
"tokio",
]
@@ -6239,7 +6216,7 @@ version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
dependencies = [
"rustls 0.22.4",
"rustls 0.22.2",
"rustls-pki-types",
"tokio",
]
@@ -6435,10 +6412,11 @@ dependencies = [
[[package]]
name = "tracing"
version = "0.1.40"
version = "0.1.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
dependencies = [
"cfg-if",
"log",
"pin-project-lite",
"tracing-attributes",
@@ -6458,9 +6436,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
version = "0.1.27"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
dependencies = [
"proc-macro2",
"quote",
@@ -6469,9 +6447,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.32"
version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
dependencies = [
"once_cell",
"valuable",
@@ -6529,7 +6507,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -6700,7 +6677,7 @@ dependencies = [
"base64 0.21.1",
"log",
"once_cell",
"rustls 0.21.11",
"rustls 0.21.9",
"rustls-webpki 0.100.2",
"url",
"webpki-roots 0.23.1",
@@ -6927,9 +6904,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
@@ -6937,9 +6914,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb"
dependencies = [
"bumpalo",
"log",
@@ -6952,9 +6929,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.42"
version = "0.4.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0"
checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
dependencies = [
"cfg-if",
"js-sys",
@@ -6964,9 +6941,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@@ -6974,9 +6951,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
dependencies = [
"proc-macro2",
"quote",
@@ -6987,9 +6964,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.92"
version = "0.2.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
[[package]]
name = "wasm-streams"
@@ -7021,9 +6998,9 @@ dependencies = [
[[package]]
name = "web-sys"
version = "0.3.69"
version = "0.3.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2"
dependencies = [
"js-sys",
"wasm-bindgen",
@@ -7377,7 +7354,7 @@ dependencies = [
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"reqwest",
"rustls 0.21.11",
"rustls 0.21.9",
"scopeguard",
"serde",
"serde_json",

View File

@@ -45,10 +45,10 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.19"
azure_identity = "0.19"
azure_storage = "0.19"
azure_storage_blobs = "0.19"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
azure_storage_blobs = "0.18"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -157,8 +157,7 @@ socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
# https://github.com/nical/rust_debug/pull/4
svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
@@ -180,7 +179,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"

View File

@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
ifndef DISABLE_HOMEBREW
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# Use -C option so that when PostgreSQL "make install" installs the

View File

@@ -51,7 +51,6 @@ use tracing::{error, info};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -69,29 +68,6 @@ use compute_tools::spec::*;
const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
let (pg_handle, start_pg_result) =
{
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_result = process_cli(&clap_args)?;
let wait_spec_result = wait_spec(build_tag, cli_result)?;
start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing context
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
cleanup_and_exit(start_pg_result, wait_pg_result)
}
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -106,11 +82,35 @@ fn init() -> Result<(String, clap::ArgMatches)> {
.to_string();
info!("build_tag: {build_tag}");
Ok((build_tag, cli().get_matches()))
}
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
{
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -147,7 +147,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
@@ -157,42 +157,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard>
Some(guard)
} else {
None
}
}
};
fn process_cli(
matches: &clap::ArgMatches,
) -> Result<ProcessCliResult> {
let pgbin_default = "postgres";
let pgbin = matches
.get_one::<String>("pgbin")
.map(|s| s.as_str())
.unwrap_or(pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
.map(|conf| {
if conf.starts_with("http") {
conf.trim_end_matches('/')
} else {
"http://pg-ext-s3-gateway"
}
});
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
let connstr = matches
.get_one::<String>("connstr")
.expect("Postgres connection string is required");
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -233,45 +199,6 @@ fn process_cli(
}
};
let result = ProcessCliResult {
// directly from CLI:
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
// others:
spec,
live_config_allowed,
};
Ok(result)
}
struct ProcessCliResult<'clap> {
connstr: &'clap str,
pgdata: &'clap str,
pgbin: &'clap str,
ext_remote_storage: Option<&'clap str>,
http_port: u16,
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
spec,
live_config_allowed,
}: ProcessCliResult,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new();
let spec_set;
@@ -310,6 +237,8 @@ fn wait_spec(
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -326,19 +255,6 @@ fn wait_spec(
}
}
Ok(WaitSpecResult { compute, http_port })
}
struct WaitSpecResult {
compute: Arc<ComputeNode>,
// passed through from ProcessCliResult
http_port: u16,
}
fn start_postgres(
matches: &clap::ArgMatches,
WaitSpecResult { compute, http_port }: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
@@ -365,10 +281,9 @@ fn start_postgres(
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);
let extension_server_port: u16 = http_port;
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
@@ -419,7 +334,7 @@ fn start_postgres(
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
let vm_monitor = rt.as_ref().map(|rt| {
let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
@@ -432,43 +347,12 @@ fn start_postgres(
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
rt,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
rt: Option<tokio::runtime::Runtime>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(
pg: Option<PostgresHandle>,
) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
@@ -483,26 +367,6 @@ fn wait_postgres(
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_and_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
rt,
}: StartPostgresResult,
WaitPostgresResult { exit_code }: WaitPostgresResult,
) -> Result<()> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.

View File

@@ -818,15 +818,9 @@ impl ComputeNode {
Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role
let mut func = || {
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
Ok::<_, anyhow::Error>(())
};
func().context("apply_config setup cloud_admin")?;
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
};
// Disable DDL forwarding because control plane already knows about these roles/databases.
client
.simple_query("SET neon.forward_ddl = false")
.context("apply_config SET neon.forward_ddl = false")?;
client.simple_query("SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
handle_roles(spec, &mut client).context("apply_config handle_roles")?;
handle_databases(spec, &mut client).context("apply_config handle_databases")?;
handle_role_deletions(spec, connstr.as_str(), &mut client)
.context("apply_config handle_role_deletions")?;
create_neon_superuser(spec, &mut client)?;
cleanup_instance(&mut client)?;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
handle_grants(
spec,
&mut client,
connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension),
)
.context("apply_config handle_grants")?;
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
create_availability_check_data(&mut client)
.context("apply_config create_availability_check_data")?;
)?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
// 'Close' connection
drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations")
handle_migrations(&mut client)
});
Ok(())
}

View File

@@ -2,7 +2,7 @@ use std::fs::File;
use std::path::Path;
use std::str::FromStr;
use anyhow::{anyhow, bail, Context, Result};
use anyhow::{anyhow, bail, Result};
use postgres::config::Config;
use postgres::{Client, NoTls};
use reqwest::StatusCode;
@@ -698,8 +698,7 @@ pub fn handle_grants(
// it is important to run this after all grants
if enable_anon_extension {
handle_extension_anon(spec, &db.owner, &mut db_client, false)
.context("handle_grants handle_extension_anon")?;
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
}
}
@@ -814,36 +813,28 @@ $$;"#,
// Add new migrations below.
];
let mut func = || {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
let query = "SELECT id FROM neon_migration.migration_id";
let row = client
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
query = "SELECT id FROM neon_migration.migration_id";
let row = client.query_one(query, &[])?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
client
.simple_query(query)
.context("handle_migrations begin")?;
query = "BEGIN";
client.simple_query(query)?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
@@ -851,9 +842,7 @@ $$;"#,
info!("Skip migration id={}", current_migration);
} else {
info!("Running migration:\n{}\n", migration);
client.simple_query(migration).with_context(|| {
format!("handle_migrations current_migration={}", current_migration)
})?;
client.simple_query(migration)?;
}
current_migration += 1;
}
@@ -861,14 +850,10 @@ $$;"#,
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client
.simple_query(&setval)
.context("handle_migrations update id")?;
client.simple_query(&setval)?;
let query = "COMMIT";
client
.simple_query(query)
.context("handle_migrations commit")?;
query = "COMMIT";
client.simple_query(query)?;
info!(
"Ran {} migrations",

View File

@@ -17,7 +17,6 @@ nix.workspace = true
once_cell.workspace = true
postgres.workspace = true
hex.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }

View File

@@ -417,54 +417,6 @@ async fn handle_tenant(
println!("{} {:?}", t.id, t.state);
}
}
Some(("import", import_match)) => {
let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
let storage_controller = StorageController::from_env(env);
let create_response = storage_controller.tenant_import(tenant_id).await?;
let shard_zero = create_response
.shards
.first()
.expect("Import response omitted shards");
let attached_pageserver_id = shard_zero.node_id;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
println!(
"Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
);
let timelines = pageserver
.http_client
.list_timelines(shard_zero.shard_id)
.await?;
// Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
let main_timeline = timelines
.iter()
.find(|t| t.ancestor_timeline_id.is_none())
.expect("No timelines found")
.timeline_id;
let mut branch_i = 0;
for timeline in timelines.iter() {
let branch_name = if timeline.timeline_id == main_timeline {
"main".to_string()
} else {
branch_i += 1;
format!("branch_{branch_i}")
};
println!(
"Importing timeline {tenant_id}/{} as branch {branch_name}",
timeline.timeline_id
);
env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
}
}
Some(("create", create_match)) => {
let tenant_conf: HashMap<_, _> = create_match
.get_many::<String>("config")
@@ -1465,7 +1417,6 @@ fn cli() -> Command {
.subcommand(
Command::new("timeline")
.about("Manage timelines")
.arg_required_else_help(true)
.subcommand(Command::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
@@ -1528,8 +1479,6 @@ fn cli() -> Command {
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
.about("Import a tenant that is present in remote storage, and create branches for its timelines"))
)
.subcommand(
Command::new("pageserver")
@@ -1554,8 +1503,8 @@ fn cli() -> Command {
Command::new("storage_controller")
.arg_required_else_help(true)
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller"))
.subcommand(Command::new("stop").about("Stop storage controller")
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
.subcommand(Command::new("stop").about("Stop local pageserver")
.arg(stop_mode_arg.clone()))
)
.subcommand(

View File

@@ -17,7 +17,6 @@ use std::net::Ipv4Addr;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::time::Duration;
use utils::{
auth::{encode_from_key_file, Claims},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -67,10 +66,6 @@ pub struct LocalEnv {
pub broker: NeonBroker,
// Configuration for the storage controller (1 per neon_local environment)
#[serde(default)]
pub storage_controller: NeonStorageControllerConf,
/// This Vec must always contain at least one pageserver
pub pageservers: Vec<PageServerConf>,
@@ -103,29 +98,6 @@ pub struct NeonBroker {
pub listen_addr: SocketAddr,
}
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct NeonStorageControllerConf {
/// Heartbeat timeout before marking a node offline
#[serde(with = "humantime_serde")]
pub max_unavailable: Duration,
}
impl NeonStorageControllerConf {
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
std::time::Duration::from_secs(10);
}
impl Default for NeonStorageControllerConf {
fn default() -> Self {
Self {
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
}
}
}
// Dummy Default impl to satisfy Deserialize derive.
impl Default for NeonBroker {
fn default() -> Self {
@@ -157,8 +129,6 @@ pub struct PageServerConf {
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
pub(crate) get_impl: Option<String>,
pub(crate) validate_vectored_get: Option<bool>,
}
impl Default for PageServerConf {
@@ -171,8 +141,6 @@ impl Default for PageServerConf {
http_auth_type: AuthType::Trust,
virtual_file_io_engine: None,
get_vectored_impl: None,
get_impl: None,
validate_vectored_get: None,
}
}
}
@@ -188,7 +156,6 @@ pub struct SafekeeperConf {
pub remote_storage: Option<String>,
pub backup_threads: Option<u32>,
pub auth_enabled: bool,
pub listen_addr: Option<String>,
}
impl Default for SafekeeperConf {
@@ -202,7 +169,6 @@ impl Default for SafekeeperConf {
remote_storage: None,
backup_threads: None,
auth_enabled: false,
listen_addr: None,
}
}
}

View File

@@ -92,8 +92,6 @@ impl PageServerNode {
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
} = &self.conf;
let id = format!("id={}", id);
@@ -113,16 +111,6 @@ impl PageServerNode {
} else {
String::new()
};
let get_impl = if let Some(get_impl) = get_impl {
format!("get_impl='{get_impl}'")
} else {
String::new()
};
let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
format!("validate_vectored_get={validate_vectored_get}")
} else {
String::new()
};
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -136,8 +124,6 @@ impl PageServerNode {
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -448,11 +434,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -571,11 +552,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
}
};

View File

@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: reqwest::Client,
pub listen_addr: String,
pub http_base_url: String,
}
impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
listen_addr.clone()
} else {
"127.0.0.1".to_string()
};
SafekeeperNode {
id: conf.id,
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: reqwest::Client::new(),
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
listen_addr,
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
}
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
);
io::stdout().flush().unwrap();
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id;
let datadir = self.datadir_path();
@@ -146,7 +139,7 @@ impl SafekeeperNode {
availability_zone,
];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
}
if !self.conf.sync {

View File

@@ -1,7 +1,4 @@
use crate::{
background_process,
local_env::{LocalEnv, NeonStorageControllerConf},
};
use crate::{background_process, local_env::LocalEnv};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Method;
use pageserver_api::{
@@ -35,13 +32,15 @@ pub struct StorageController {
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
config: NeonStorageControllerConf,
}
const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
config: env.storage_controller.clone(),
}
}
@@ -274,6 +272,8 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
let mut args = vec![
"-l",
&self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
"--database-url",
&database_url,
"--max-unavailable-interval",
&humantime::Duration::from(self.config.max_unavailable).to_string(),
&max_unavailable.to_string(),
]
.into_iter()
.map(|s| s.to_string())
@@ -472,16 +472,6 @@ impl StorageController {
.await
}
#[instrument(skip(self))]
pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
self.dispatch::<(), TenantCreateResponse>(
Method::POST,
format!("debug/v1/tenant/{tenant_id}/import"),
None,
)
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(

View File

@@ -1,15 +1,15 @@
use std::{collections::HashMap, str::FromStr, time::Duration};
use std::{collections::HashMap, str::FromStr};
use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -120,12 +120,6 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
}
#[derive(Parser)]
@@ -587,94 +581,6 @@ async fn main() -> anyhow::Result<()> {
}
println!("{table}");
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't known to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
}
Ok(())

View File

@@ -1,150 +0,0 @@
# Storage Controller
## Concepts
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
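As a rough illustration of this pattern, the sketch below is standalone Rust with hypothetical `LocationState`/`reconcile` names (not the controller's real types): an intent state is pushed out to an observed state, which in the real controller means issuing pageserver API calls rather than a plain assignment.

```
// Hypothetical intent/observed types; the real controller tracks per-shard
// attachment and secondary locations with richer types.
#[derive(Clone, PartialEq, Eq, Debug, Default)]
struct LocationState {
    attached_pageserver: Option<u64>,
    secondary_pageservers: Vec<u64>,
}

/// Make the outside world (observed) match the intent. In the real controller
/// this step issues pageserver API calls instead of a simple assignment.
fn reconcile(intent: &LocationState, observed: &mut LocationState) {
    if *observed != *intent {
        *observed = intent.clone();
    }
}

fn main() {
    let intent = LocationState {
        attached_pageserver: Some(1),
        secondary_pageservers: vec![2],
    };
    let mut observed = LocationState::default();
    reconcile(&intent, &mut observed);
    assert_eq!(observed, intent);
}
```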
## APIs
The storage controller's HTTP server implements four logically separate APIs:
- `/v1/...` is the pageserver-compatible API. This has to be at the path root because that's where clients expect to find it on a pageserver.
- `/control/v1/...` is the storage controller's own API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` contains endpoints which are either used exclusively in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs that pageservers use to ensure data safety with generation numbers.
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as the pageserver's own APIs).
See the `http.rs` file in the source for where the HTTP APIs are implemented.
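For illustration only, a client of any of these APIs is just an HTTP client presenting a bearer token. The sketch below assumes the `reqwest` and `tokio` crates, a placeholder controller address, and a token read from an environment variable; the specific endpoint is illustrative of the path layout rather than a documented controller route.

```
// Illustrative only: call the storage controller's pageserver-compatible API
// with a JWT that carries the `pageserverapi` scope. Address, token source and
// endpoint here are placeholders/assumptions.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let controller = "http://127.0.0.1:1234";
    let token = std::env::var("STORCON_JWT").unwrap_or_default();

    let resp = reqwest::Client::new()
        // Pageserver-compatible routes live at the path root: `/v1/...`
        .get(format!("{controller}/v1/tenant"))
        .bearer_auth(token)
        .send()
        .await?;
    println!("status: {}", resp.status());
    Ok(())
}
```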
## Database
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.
Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller's database.
### Diesel tip: migrations
If you need to modify the database schema, here's how to create a migration:
- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary and automatically run at startup after it is deployed, so once you've committed a migration no further steps are needed.
## storcon_cli
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debugging, but may also be used to manage nodes (e.g. marking a node as offline).
`storcon_cli --help` includes details on commands.
# Deploying
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._
## Database
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
Set the URL to the database using the `--database-url` CLI option.
There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.
## Configure pageservers to use the storage controller
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
with the storage controller when it starts up. See the example below for the format of this file.
### Example `metadata.json`
```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
the storage controller runs.
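For reference, here is a minimal sketch of parsing this file, assuming the `serde` (with the derive feature) and `serde_json` crates; the struct simply mirrors the fields of the example above and is not the pageserver's real type.

```
use serde::Deserialize;

// Hypothetical mirror of the metadata.json fields shown above.
#[derive(Deserialize, Debug)]
struct PageserverMetadata {
    host: String,      // postgres (libpq) host, reachable from computes
    port: u16,         // postgres (libpq) port
    http_host: String, // pageserver HTTP API host, reachable from the controller
    http_port: u16,    // pageserver HTTP API port
}

fn main() -> Result<(), serde_json::Error> {
    let raw = r#"{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}"#;
    let md: PageserverMetadata = serde_json::from_str(raw)?;
    println!("{md:?}");
    Ok(())
}
```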
## Handle compute notifications
The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
}

struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    stripe_size: Option<ShardStripeSize>,
    shards: Vec<ComputeHookNotifyRequestShard>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds.
### Example notification body
```
{
  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
  "stripe_size": 32768,
  "shards": [
    {"node_id": 344, "shard_number": 0},
    {"node_id": 722, "shard_number": 1}
  ]
}
```
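To tie the steps above together, here is a hedged sketch of a hook receiver. It assumes the `serde` (derive), `serde_json` and `anyhow` crates; the connection-string format and the node-id lookup are placeholders for your own pageserver inventory, and the postgres reconfiguration/SIGHUP step is only indicated by comments.

```
use serde::Deserialize;

#[derive(Deserialize)]
struct ComputeHookNotifyRequestShard {
    node_id: u64,
    shard_number: u8,
}

#[derive(Deserialize)]
struct ComputeHookNotifyRequest {
    tenant_id: String,
    stripe_size: Option<u32>,
    shards: Vec<ComputeHookNotifyRequestShard>,
}

/// Placeholder: map a pageserver node id to its libpq address from your inventory.
fn pageserver_connstr(node_id: u64) -> String {
    format!("postgresql://no_user@pageserver-{node_id}.localdomain:6400")
}

fn handle_notification(body: &str) -> anyhow::Result<()> {
    let req: ComputeHookNotifyRequest = serde_json::from_str(body)?;

    // Step 1: build the comma-separated connection string, ordered by shard number.
    let mut shards = req.shards;
    shards.sort_by_key(|s| s.shard_number);
    let connstr = shards
        .iter()
        .map(|s| pageserver_connstr(s.node_id))
        .collect::<Vec<_>>()
        .join(",");
    println!(
        "tenant {}: SET neon.pageserver_connstr = '{connstr}'",
        req.tenant_id
    );
    if let Some(stripe_size) = req.stripe_size {
        println!("SET neon.stripe_size = {stripe_size}");
    }

    // Step 2: send SIGHUP to postgres (or call pg_reload_conf()) -- omitted here.
    // Step 3: only report success once the configuration was really updated.
    Ok(())
}

fn main() -> anyhow::Result<()> {
    // Drive the handler with the example notification body shown above.
    let body = r#"{
        "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
        "stripe_size": 32768,
        "shards": [
            {"node_id": 344, "shard_number": 0},
            {"node_id": 722, "shard_number": 1}
        ]
    }"#;
    handle_notification(body)
}
```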

View File

@@ -256,16 +256,7 @@ fn update_rusage_metrics() {
DISK_IO_BYTES
.with_label_values(&["write"])
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
// On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
#[cfg(target_os = "macos")]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
}
#[cfg(not(target_os = "macos"))]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss);
}
MAXRSS_KB.set(rusage_stats.ru_maxrss);
}
fn get_rusage_stats() -> libc::rusage {

View File

@@ -1,6 +1,5 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
@@ -22,107 +21,15 @@ pub struct Key {
pub field6: u32,
}
/// The storage key size.
pub const KEY_SIZE: usize = 18;
/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
/// The (reserved) key prefix of relation sizes.
pub const RELATION_SIZE_PREFIX: u8 = 0x61;
/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x62;
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
}
impl Key {
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key(&self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key(key: &[u8]) -> Self {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub fn metadata_key_range() -> Range<Self> {
Key {
field1: METADATA_KEY_BEGIN_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: METADATA_KEY_END_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
/// Get the range of aux keys.
pub fn metadata_aux_key_range() -> Range<Self> {
Key {
field1: AUX_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: AUX_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
(((self.field1 & 0xf) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
| ((self.field4 as i128) << 40)
@@ -132,7 +39,7 @@ impl Key {
pub const fn from_i128(x: i128) -> Self {
Key {
field1: ((x >> 120) & 0x7F) as u8,
field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32,
field3: (x >> 72) as u32,
field4: (x >> 40) as u32,
@@ -141,11 +48,11 @@ impl Key {
}
}
pub const fn next(&self) -> Key {
pub fn next(&self) -> Key {
self.add(1)
}
pub const fn add(&self, x: u32) -> Key {
pub fn add(&self, x: u32) -> Key {
let mut key = *self;
let r = key.field6.overflowing_add(x);
@@ -174,8 +81,6 @@ impl Key {
key
}
/// Convert an 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -187,8 +92,6 @@ impl Key {
}
}
/// Convert a key to an 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -572,14 +475,12 @@ pub const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
// AUX_FILES currently stores only data for logical replication (slots etc.), and
// we don't preserve these on a branch because safekeepers can't follow a timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
!NON_INHERITED_RANGE.contains(&key)
key != AUX_FILES_KEY
}
#[inline(always)]
@@ -655,14 +556,11 @@ impl std::str::FromStr for Key {
mod tests {
use std::str::FromStr;
use crate::key::is_metadata_key_slice;
use crate::key::Key;
use rand::Rng;
use rand::SeedableRng;
use super::AUX_KEY_PREFIX;
#[test]
fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -678,16 +576,4 @@ mod tests {
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
}
#[test]
fn test_metadata_keys() {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
}

View File

@@ -1,10 +1,7 @@
use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::{
key::Key,
shard::{ShardCount, ShardIdentity},
};
use crate::key::Key;
use itertools::Itertools;
///
@@ -17,279 +14,44 @@ pub struct KeySpace {
pub ranges: Vec<Range<Key>>,
}
/// A wrapper type for sparse keyspaces.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);
/// Represents a contiguous half-open range of the keyspace, masked according to a particular
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
/// shard.
///
/// When we iterate over keys within this object, we will skip any keys that don't belong
/// to this shard.
///
/// The start + end keys may not belong to the shard: these specify where layer files should
/// start + end, but we will never actually read/write those keys.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ShardedRange<'a> {
pub shard_identity: &'a ShardIdentity,
pub range: Range<Key>,
}
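// A minimal usage sketch (bindings and values here are illustrative, mirroring the
// tests at the bottom of this file): wrap a key range for shard 0 of a 4-shard
// tenant, ask how many of its pages live on this shard, and split it into
// fragments of at most 8192 shard-local blocks.
//
//     let shard_identity = ShardIdentity::new(
//         ShardNumber(0),
//         ShardCount::new(4),
//         ShardParameters::DEFAULT_STRIPE_SIZE,
//     )?;
//     let sharded = ShardedRange::new(rel_block_range, &shard_identity);
//     let local_pages = sharded.page_count(); // u32::MAX if the range spans relations
//     let fragments = sharded.fragment(8192); // Vec<(shard-local blocks, sub-range)>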
// Calculate the size of a range within the blocks of the same relation, or spanning only the
// top page in the previous relation's space.
fn contiguous_range_len(range: &Range<Key>) -> u32 {
debug_assert!(is_contiguous_range(range));
if range.start.field6 == 0xffffffff {
range.end.field6 + 1
} else {
range.end.field6 - range.start.field6
}
}
/// Return true if this key range includes only keys in the same relation's data blocks, or
/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
///
/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
/// be on our shard. Later in ShardedRange we do the extra work to figure out how much
/// of a given contiguous range is present on one shard.
///
/// This matters, because:
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
fn is_contiguous_range(range: &Range<Key>) -> bool {
range.start.field1 == range.end.field1
&& range.start.field2 == range.end.field2
&& range.start.field3 == range.end.field3
&& range.start.field4 == range.end.field4
&& (range.start.field5 == range.end.field5
|| (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
}
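// For example (a sketch using the layout above): a range covering blocks 0x10..0x80
// of a single relation fork is contiguous, and its length is plain field6 arithmetic,
// 0x80 - 0x10 = 0x70 blocks; a range whose endpoints lie in different relations is
// not contiguous, and size estimates for it fall back to u32::MAX.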
impl<'a> ShardedRange<'a> {
pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
Self {
shard_identity,
range,
}
}
/// Break up this range into chunks, each of which has at least one local key in it if the
/// total range has at least one local key.
pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
// Optimization for single-key case (e.g. logical size keys)
if self.range.end == self.range.start.add(1) {
return vec![(
if self.shard_identity.is_key_disposable(&self.range.start) {
0
} else {
1
},
self.range,
)];
}
if !is_contiguous_range(&self.range) {
// Ranges that span relations are not fragmented. We only get these ranges as a result
// of operations that act on existing layers, so we trust that the existing range is
// reasonably small.
return vec![(u32::MAX, self.range)];
}
let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();
let mut cursor = self.range.start;
while cursor < self.range.end {
let advance_by = self.distance_to_next_boundary(cursor);
let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);
// If the previous fragment is undersized, then we seek to consume enough
// blocks to complete it.
let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
Some(frag) => {
// Prev block is complete, want the full number.
(
target_nblocks,
if is_fragment_disposable {
// If this current range will be empty (not shard-local data), we will merge into previous
Some(frag)
} else {
None
},
)
}
None => {
// First iteration, want the full number
(target_nblocks, None)
}
};
let advance_by = if is_fragment_disposable {
advance_by
} else {
std::cmp::min(advance_by, want_blocks)
};
let next_cursor = cursor.add(advance_by);
let this_frag = (
if is_fragment_disposable {
0
} else {
advance_by
},
cursor..next_cursor,
);
cursor = next_cursor;
if let Some(last_fragment) = merge_last_fragment {
// Previous fragment was short or this one is empty, merge into it
last_fragment.0 += this_frag.0;
last_fragment.1.end = this_frag.1.end;
} else {
fragments.push(this_frag);
}
}
fragments
}
/// Estimate the physical pages that are within this range, on this shard. This returns
/// u32::MAX if the range spans relations: this return value should be interpreted as "large".
pub fn page_count(&self) -> u32 {
// Special cases for single keys like logical sizes
if self.range.end == self.range.start.add(1) {
return if self.shard_identity.is_key_disposable(&self.range.start) {
0
} else {
1
};
}
// We can only do an authentic calculation of contiguous key ranges
if !is_contiguous_range(&self.range) {
return u32::MAX;
}
// Special case for single sharded tenants: our logical and physical sizes are the same
if self.shard_identity.count < ShardCount::new(2) {
return contiguous_range_len(&self.range);
}
// Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
// to Self, and add the stripe's block count to our total if so.
let mut result: u64 = 0;
let mut cursor = self.range.start;
while cursor < self.range.end {
// Count up to the next stripe_size boundary or end of range
let advance_by = self.distance_to_next_boundary(cursor);
// If the blocks in this stripe belong to us, add them to our count
if !self.shard_identity.is_key_disposable(&cursor) {
result += advance_by as u64;
}
cursor = cursor.add(advance_by);
}
if result > u32::MAX as u64 {
u32::MAX
} else {
result as u32
}
}
/// Advance the cursor to the next potential fragment boundary: this is either
/// a stripe boundary, or the end of the range.
fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));
if self.shard_identity.count < ShardCount::new(2) {
// Optimization: don't bother stepping through stripes if the tenant isn't sharded.
return distance_to_range_end;
}
if cursor.field6 == 0xffffffff {
// We are wrapping from one relation's logical size to the next relation's first data block
return 1;
}
let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
let stripe_remainder = self.shard_identity.stripe_size.0
- (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);
if cfg!(debug_assertions) {
// We should never overflow field5 and field6 -- our callers check this earlier
// and would have returned their u32::MAX cases if the input range violated this.
let next_cursor = cursor.add(stripe_remainder);
debug_assert!(
next_cursor.field1 == cursor.field1
&& next_cursor.field2 == cursor.field2
&& next_cursor.field3 == cursor.field3
&& next_cursor.field4 == cursor.field4
&& next_cursor.field5 == cursor.field5
)
}
std::cmp::min(stripe_remainder, distance_to_range_end)
}
/// Whereas `page_count` estimates the number of pages physically in this range on this shard,
/// this function simply calculates the number of pages in the space, without accounting for those
/// pages that would not actually be stored on this node.
///
/// Don't use this function in code that works with physical entities like layer files.
fn raw_size(range: &Range<Key>) -> u32 {
if is_contiguous_range(range) {
contiguous_range_len(range)
} else {
u32::MAX
}
}
}
impl KeySpace {
/// Create a key space with a single range.
pub fn single(key_range: Range<Key>) -> Self {
Self {
ranges: vec![key_range],
}
}
///
/// Partition a key space into chunks of roughly 'target_size' bytes each.
///
pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
// Assume that each value is 8k in size.
let target_nblocks = (target_size / BLCKSZ as u64) as u32;
let target_nblocks = (target_size / BLCKSZ as u64) as usize;
let mut parts = Vec::new();
let mut current_part = Vec::new();
let mut current_part_size: usize = 0;
for range in &self.ranges {
// While doing partitioning, wrap the range in ShardedRange so that our size calculations
// will respect shard striping rather than assuming all keys within a range are present.
let range = ShardedRange::new(range.clone(), shard_identity);
// Chunk up the range into parts that each contain up to target_size local blocks
for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
// If appending the next contiguous range in the keyspace to the current
// partition would cause it to be too large, and our current partition
// covers at least one block that is physically present in this shard,
// then start a new partition
if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
&& current_part_size > 0
{
parts.push(KeySpace {
ranges: current_part,
});
current_part = Vec::new();
current_part_size = 0;
}
current_part.push(frag_range.start..frag_range.end);
current_part_size += frag_on_shard_size as usize;
// If appending the next contiguous range in the keyspace to the current
// partition would cause it to be too large, start a new partition.
let this_size = key_range_size(range) as usize;
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
parts.push(KeySpace {
ranges: current_part,
});
current_part = Vec::new();
current_part_size = 0;
}
// If the next range is larger than 'target_size', split it into
// 'target_size' chunks.
let mut remain_size = this_size;
let mut start = range.start;
while remain_size > target_nblocks {
let next = start.add(target_nblocks as u32);
parts.push(KeySpace {
ranges: vec![start..next],
});
start = next;
remain_size -= target_nblocks
}
current_part.push(start..range.end);
current_part_size += remain_size;
}
// add last partition that wasn't full yet.
@@ -302,10 +64,6 @@ impl KeySpace {
KeyPartitioning { parts }
}
pub fn is_empty(&self) -> bool {
self.total_raw_size() == 0
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not overlap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
@@ -336,13 +94,12 @@ impl KeySpace {
/// Remove all keys in `other` from `self`.
/// This can involve splitting or removing of existing ranges.
/// Returns the removed keyspace
pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
_ => {
// self is empty
return KeySpace::default();
return;
}
};
@@ -355,37 +112,30 @@ impl KeySpace {
.skip_while(|range| self_start >= range.end)
.take_while(|range| self_end > range.start);
let mut removed_accum = KeySpaceRandomAccum::new();
for range in other_ranges {
while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped.
removed_accum.add_range(range.start..self.ranges[overlap_at].end);
self.ranges[overlap_at].end = range.start;
}
if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped.
removed_accum.add_range(self.ranges[overlap_at].start..range.end);
self.ranges[overlap_at].start = range.end;
}
if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped.
removed_accum.add_range(range.clone());
self.ranges[overlap_at].end = range.start;
self.ranges
.insert(overlap_at + 1, range.end..overlapped.end);
}
if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped
removed_accum.add_range(self.ranges[overlap_at].clone());
self.ranges.remove(overlap_at);
}
}
}
removed_accum.to_keyspace()
}
pub fn start(&self) -> Option<Key> {
@@ -396,11 +146,11 @@ impl KeySpace {
self.ranges.last().map(|range| range.end)
}
/// The size of the keyspace in pages, before accounting for sharding
pub fn total_raw_size(&self) -> usize {
#[allow(unused)]
pub fn total_size(&self) -> usize {
self.ranges
.iter()
.map(|range| ShardedRange::raw_size(range) as usize)
.map(|range| key_range_size(range) as usize)
.sum()
}
@@ -420,11 +170,6 @@ impl KeySpace {
pub fn overlaps(&self, range: &Range<Key>) -> bool {
self.overlaps_at(range).is_some()
}
/// Check if the keyspace contains a key
pub fn contains(&self, key: &Key) -> bool {
self.overlaps(&(*key..key.next()))
}
}
///
@@ -439,33 +184,10 @@ pub struct KeyPartitioning {
pub parts: Vec<KeySpace>,
}
/// Represents a partitioning of the sparse key space.
#[derive(Clone, Debug, Default)]
pub struct SparseKeyPartitioning {
pub parts: Vec<SparseKeySpace>,
}
impl KeyPartitioning {
pub fn new() -> Self {
KeyPartitioning { parts: Vec::new() }
}
/// Convert a key partitioning to a sparse partition.
pub fn into_sparse(self) -> SparseKeyPartitioning {
SparseKeyPartitioning {
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
}
}
}
impl SparseKeyPartitioning {
/// Note: use this function with caution. Attempting to handle a sparse keyspace in the same way as a dense
/// keyspace can cause long or infinite loops.
pub fn into_dense(self) -> KeyPartitioning {
KeyPartitioning {
parts: self.parts.into_iter().map(|x| x.0).collect(),
}
}
}
///
@@ -497,7 +219,7 @@ impl KeySpaceAccum {
#[inline(always)]
pub fn add_range(&mut self, range: Range<Key>) {
self.size += ShardedRange::raw_size(&range) as u64;
self.size += key_range_size(&range) as u64;
match self.accum.as_mut() {
Some(accum) => {
@@ -529,9 +251,7 @@ impl KeySpaceAccum {
std::mem::take(self).to_keyspace()
}
// The total number of keys in this object, ignoring any sharding effects that might cause some of
// the keys to be omitted in storage on this shard.
pub fn raw_size(&self) -> u64 {
pub fn size(&self) -> u64 {
self.size
}
}
@@ -587,19 +307,36 @@ impl KeySpaceRandomAccum {
}
}
#[inline(always)]
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
let start = key_range.start;
let end = key_range.end;
if end.field1 != start.field1
|| end.field2 != start.field2
|| end.field3 != start.field3
|| end.field4 != start.field4
{
return u32::MAX;
}
let start = (start.field5 as u64) << 32 | start.field6 as u64;
let end = (end.field5 as u64) << 32 | end.field6 as u64;
let diff = end - start;
if diff > u32::MAX as u64 {
u32::MAX
} else {
diff as u32
}
}
pub fn singleton_range(key: Key) -> Range<Key> {
key..key.next()
}
#[cfg(test)]
mod tests {
use rand::{RngCore, SeedableRng};
use crate::{
models::ShardParameters,
shard::{ShardCount, ShardNumber},
};
use super::*;
use std::fmt::Write;
@@ -642,17 +379,14 @@ mod tests {
accum.add_range(range.clone());
}
let expected_size: u64 = ranges
.iter()
.map(|r| ShardedRange::raw_size(r) as u64)
.sum();
assert_eq!(accum.raw_size(), expected_size);
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
assert_eq!(accum.size(), expected_size);
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
assert_eq!(accum.raw_size(), 0);
assert_eq!(accum.size(), 0);
assert_ks_eq(&accum.consume_keyspace(), vec![]);
assert_eq!(accum.raw_size(), 0);
assert_eq!(accum.size(), 0);
for range in &ranges {
accum.add_range(range.clone());
@@ -819,16 +553,7 @@ mod tests {
Key::from_i128(11)..Key::from_i128(13),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -858,17 +583,7 @@ mod tests {
Key::from_i128(14)..Key::from_i128(17),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(5),
Key::from_i128(8)..Key::from_i128(10),
Key::from_i128(14)..Key::from_i128(15),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -895,11 +610,7 @@ mod tests {
Key::from_i128(15)..Key::from_i128(17),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace::default();
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -926,17 +637,7 @@ mod tests {
let key_space2 = KeySpace {
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(9)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
Key::from_i128(17)..Key::from_i128(19),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -949,412 +650,4 @@ mod tests {
]
);
}
#[test]
fn sharded_range_relation_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
},
&shard_identity,
);
// Key range spans relations, expect MAX
assert_eq!(range.page_count(), u32::MAX);
}
#[test]
fn shard_identity_keyspaces_single_key() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
},
&shard_identity,
);
// Single-key range on logical size key
assert_eq!(range.page_count(), 1);
}
/// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
#[test]
fn contiguous_range_check() {
assert!(!is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
),);
// The range goes all the way up to 0xffffffff, including it: this is
// not considered a rel block range because 0xffffffff stores logical sizes,
// not blocks.
assert!(!is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
),);
// Keys within the normal data region of a relation
assert!(is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
),);
// The logical size key of one forkno, then some blocks in the next
assert!(is_contiguous_range(
&(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
),);
}
#[test]
fn shard_identity_keyspaces_forkno_gap() {
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
},
&shard_identity,
);
// Range spanning the end of one forkno and the start of the next: we do not attempt to
// calculate a valid size, because we have no way to know if the keys between start
// and end are actually in use.
assert_eq!(range.page_count(), u32::MAX);
}
#[test]
fn shard_identity_keyspaces_one_relation() {
for shard_number in 0..4 {
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
let range = ShardedRange::new(
Range {
start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
},
&shard_identity,
);
// Very simple case: range covering block zero of one relation, where that block maps to shard zero
if shard_number == 0 {
assert_eq!(range.page_count(), 1);
} else {
// Other shards should perceive the range's size as zero
assert_eq!(range.page_count(), 0);
}
}
}
/// Test helper: construct a ShardedRange and call fragment() on it, returning
/// the total page count in the range and the fragments.
fn do_fragment(
range_start: Key,
range_end: Key,
shard_identity: &ShardIdentity,
target_nblocks: u32,
) -> (u32, Vec<(u32, Range<Key>)>) {
let range = ShardedRange::new(
Range {
start: range_start,
end: range_end,
},
shard_identity,
);
let page_count = range.page_count();
let fragments = range.fragment(target_nblocks);
// Invariant: we always get at least one fragment
assert!(!fragments.is_empty());
// Invariant: the first/last fragment start/end should equal the input start/end
assert_eq!(fragments.first().unwrap().1.start, range_start);
assert_eq!(fragments.last().unwrap().1.end, range_end);
if page_count > 0 {
// Invariant: every fragment must contain at least one shard-local page, if the
// total range contains at least one shard-local page
let all_nonzero = fragments.iter().all(|f| f.0 > 0);
if !all_nonzero {
eprintln!("Found a zero-length fragment: {:?}", fragments);
}
assert!(all_nonzero);
} else {
// A range with no shard-local pages should always be returned as a single fragment
assert_eq!(fragments, vec![(0, range_start..range_end)]);
}
// Invariant: fragments must be ordered and non-overlapping
let mut last: Option<Range<Key>> = None;
for frag in &fragments {
if let Some(last) = last {
assert!(frag.1.start >= last.end);
assert!(frag.1.start > last.start);
}
last = Some(frag.1.clone())
}
// Invariant: fragments respect target_nblocks
for frag in &fragments {
assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
}
(page_count, fragments)
}
/// Really simple tests for fragment(), on a range that just contains a single stripe
/// for a single tenant.
#[test]
fn sharded_range_fragment_simple() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which we happen to know covers exactly one stripe which belongs to this shard
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
// Ask for stripe_size blocks, we get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 32768),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for more, we still get the whole stripe
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 10000000),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for target_nblocks of half the stripe size, we get two halves
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16384),
(
32768,
vec![
(16384, input_start..input_start.add(16384)),
(16384, input_start.add(16384)..input_end)
]
)
);
}
#[test]
fn sharded_range_fragment_multi_stripe() {
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
// Ask for all the blocks, get a fragment that covers the whole range but reports
// its size to be just the blocks belonging to our shard.
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 131072),
(32768, vec![(32768, input_start..input_end)])
);
// Ask for a sub-stripe quantity
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16000),
(
32768,
vec![
(16000, input_start..input_start.add(16000)),
(16000, input_start.add(16000)..input_start.add(32000)),
(768, input_start.add(32000)..input_end),
]
)
);
// Try on a range that starts slightly after our owned stripe
assert_eq!(
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
(32767, vec![(32767, input_start.add(1)..input_end)])
);
}
/// Test that our calculations work correctly when we start a range from the logical size key of
/// a previous relation.
#[test]
fn sharded_range_fragment_starting_from_logical_size() {
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x8001, vec![(0x8001, input_start..input_end)])
);
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
// store all logical sizes)
let shard_identity = ShardIdentity::new(
ShardNumber(1),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x10000),
(0x1, vec![(0x1, input_start..input_end)])
);
}
/// Test that ShardedRange behaves properly when used on un-sharded data
#[test]
fn sharded_range_fragment_unsharded() {
let shard_identity = ShardIdentity::unsharded();
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(
0x10000,
vec![
(0x8000, input_start..input_start.add(0x8000)),
(0x8000, input_start.add(0x8000)..input_start.add(0x10000))
]
)
);
}
#[test]
fn sharded_range_fragment_cross_relation() {
let shard_identity = ShardIdentity::unsharded();
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
);
// Same, but using a sharded identity
let shard_identity = ShardIdentity::new(
ShardNumber(0),
ShardCount::new(4),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 0x8000),
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
);
}
#[test]
fn sharded_range_fragment_tiny_nblocks() {
let shard_identity = ShardIdentity::unsharded();
// A small range within a single relation, split with a tiny target_nblocks
let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
assert_eq!(
do_fragment(input_start, input_end, &shard_identity, 16),
(
0x38,
vec![
(16, input_start..input_start.add(16)),
(16, input_start.add(16)..input_start.add(32)),
(16, input_start.add(32)..input_start.add(48)),
(8, input_start.add(48)..input_end),
]
)
);
}
#[test]
fn sharded_range_fragment_fuzz() {
// Use a fixed seed: we don't want to explicitly pick values, but we do want
// the test to be reproducible.
let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
for _i in 0..1000 {
let shard_identity = if prng.next_u32() % 2 == 0 {
ShardIdentity::unsharded()
} else {
let shard_count = prng.next_u32() % 127 + 1;
ShardIdentity::new(
ShardNumber((prng.next_u32() % shard_count) as u8),
ShardCount::new(shard_count as u8),
ShardParameters::DEFAULT_STRIPE_SIZE,
)
.unwrap()
};
let target_nblocks = prng.next_u32() % 65536 + 1;
let start_offset = prng.next_u32() % 16384;
// Try ranges up to 8192 blocks long, always at least 1 block
let range_size = prng.next_u32() % 8192 + 1;
// Build a range within a single relation, starting at the random offset chosen above
let input_start = Key::from_hex("000000067F00000001000004E10000000000")
.unwrap()
.add(start_offset);
let input_end = input_start.add(range_size);
// This test's main success conditions are the invariants baked into do_fragment
let (_total_size, fragments) =
do_fragment(input_start, input_end, &shard_identity, target_nblocks);
// Pick a random key within the range and check it appears in the output
let example_key = input_start.add(prng.next_u32() % range_size);
// Panic on unwrap if it isn't found
let example_key_frag = fragments
.iter()
.find(|f| f.1.contains(&example_key))
.unwrap();
// Check that the fragment containing our random key has a nonzero size if
// that key is shard-local
let example_key_local = !shard_identity.is_key_disposable(&example_key);
if example_key_local {
assert!(example_key_frag.0 > 0);
}
}
}
}

View File

@@ -303,7 +303,6 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_to_aux_file_v2: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -430,7 +429,6 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
#[serde(skip_serializing_if = "Option::is_none")]
pub tenant_id: Option<TenantShardId>,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -749,18 +747,10 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` is a transitory thing, so it has no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub process: Option<WalRedoManagerProcessStatus>,
pub pid: Option<u32>,
}
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -782,17 +772,6 @@ pub struct SecondaryProgress {
pub bytes_total: u64,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantScanRemoteStorageShard {
pub tenant_shard_id: TenantShardId,
pub generation: Option<u32>,
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantScanRemoteStorageResponse {
pub shards: Vec<TenantScanRemoteStorageShard>,
}
pub mod virtual_file {
#[derive(
Copy,
@@ -860,72 +839,39 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
}
}
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
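// As an example (a sketch only: the LSN values and the `rel`, `replay_lsn` and
// `last_modified_lsn` bindings are illustrative), a primary and a standby would
// fill in a GetPage request differently:
//
//     // Primary: ask for the latest version, hinting when the page last changed.
//     let from_primary = PagestreamGetPageRequest {
//         request_lsn: Lsn::MAX,
//         not_modified_since: last_modified_lsn,
//         rel,
//         blkno: 7,
//     };
//     // Standby: ask for the page as of the current replay LSN, with the same hint.
//     let from_standby = PagestreamGetPageRequest {
//         request_lsn: replay_lsn,
//         not_modified_since: last_modified_lsn,
//         rel,
//         blkno: 7,
//     };
//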
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
V1,
V2,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub kind: u8,
pub segno: u32,
}
@@ -972,16 +918,14 @@ pub struct TenantHistorySize {
}
impl PagestreamFeMessage {
/// Serialize a compute -> pageserver message. This is currently only used in testing
/// tools. Always uses protocol version 2.
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -990,8 +934,8 @@ impl PagestreamFeMessage {
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1000,8 +944,8 @@ impl PagestreamFeMessage {
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1011,15 +955,15 @@ impl PagestreamFeMessage {
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
@@ -1028,40 +972,18 @@ impl PagestreamFeMessage {
bytes.into()
}
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
let (request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V1 => {
// In the old protocol, each message starts with a boolean 'latest' flag,
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
// 'not_modified_since', used in the new protocol version.
let latest = body.read_u8()? != 0;
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
if latest {
(Lsn::MAX, request_lsn) // get latest version
} else {
(request_lsn, request_lsn) // get version at specified LSN
}
}
};
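// For example, a V1 GetPage request with latest=true and lsn=0/169C3D0 (value
// illustrative) becomes request_lsn=Lsn::MAX, not_modified_since=0/169C3D0;
// with latest=false it becomes request_lsn=not_modified_since=0/169C3D0.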
// The rest of the messages are the same between V1 and V2
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1070,8 +992,8 @@ impl PagestreamFeMessage {
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1080,8 +1002,8 @@ impl PagestreamFeMessage {
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1091,14 +1013,14 @@ impl PagestreamFeMessage {
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
@@ -1226,8 +1148,8 @@ mod tests {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1236,8 +1158,8 @@ mod tests {
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
latest: false,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1246,8 +1168,8 @@ mod tests {
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1257,16 +1179,14 @@ mod tests {
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
.unwrap();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -1,11 +1,9 @@
use utils::lsn::Lsn;
use crate::keyspace::SparseKeySpace;
#[derive(Debug, PartialEq, Eq)]
pub struct Partitioning {
pub keys: crate::keyspace::KeySpace,
pub sparse_keys: crate::keyspace::SparseKeySpace,
pub at_lsn: Lsn,
}
@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
let mut map = serializer.serialize_map(Some(2))?;
map.serialize_key("keys")?;
map.serialize_value(&KeySpace(&self.keys))?;
map.serialize_key("sparse_keys")?;
map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
map.serialize_key("at_lsn")?;
map.serialize_value(&WithDisplay(&self.at_lsn))?;
map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
#[derive(serde::Deserialize)]
struct De {
keys: KeySpace,
sparse_keys: KeySpace,
#[serde_as(as = "serde_with::DisplayFromStr")]
at_lsn: Lsn,
}
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
Ok(Self {
at_lsn: de.at_lsn,
keys: de.keys.0,
sparse_keys: SparseKeySpace(de.sparse_keys.0),
})
}
}
@@ -139,12 +133,6 @@ mod tests {
"030000000000000000000000000000000003"
]
],
"sparse_keys": [
[
"620000000000000000000000000000000000",
"620000000000000000000000000000000003"
]
],
"at_lsn": "0/2240160"
}
"#;

View File

@@ -5,93 +5,15 @@ use crate::{
models::ShardParameters,
};
use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::id::TenantId;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide a summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
@@ -116,7 +38,6 @@ impl ShardCount {
self.0
}
///
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
@@ -132,6 +53,33 @@ impl ShardNumber {
pub const MAX: Self = Self(u8::MAX);
}
/// TenantShardId identifies the unit of work for the Pageserver.
///
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
///
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// Historically, tenants could not have multiple shards, and were identified
/// by TenantId. To support such tenants, TenantShardId has a special legacy
/// mode where `shard_count` is equal to zero: this represents a single-sharded
/// tenant which should be written as a TenantId with no suffix.
///
/// The human-readable encoding of TenantShardId, such as used in API URLs,
/// is both forward and backward compatible: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
///
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self {
Self {
@@ -163,13 +111,10 @@ impl TenantShardId {
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool {
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
}
@@ -205,6 +150,9 @@ impl TenantShardId {
}
}
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
@@ -274,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
}
}
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
@@ -288,9 +246,6 @@ impl ShardIndex {
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
@@ -358,8 +313,6 @@ impl Serialize for TenantShardId {
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0;
@@ -437,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information needed for one member of a shard map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
#[error("Invalid shard count")]
@@ -451,7 +414,7 @@ impl ShardIdentity {
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
/// tenants. Modern single-shard tenants should not use this: they should
/// have number=0 count=1.
pub const fn unsharded() -> Self {
pub fn unsharded() -> Self {
Self {
number: ShardNumber(0),
count: ShardCount(0),
@@ -476,9 +439,6 @@ impl ShardIdentity {
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0)
}
@@ -527,8 +487,6 @@ impl ShardIdentity {
}
/// Return true if the key should be ingested by this shard
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -539,9 +497,7 @@ impl ShardIdentity {
}
/// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split.
///
/// Shards _may_ drop keys which return false here, but are not obliged to.
/// data store, e.g. during compaction after a split
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -567,7 +523,7 @@ impl ShardIdentity {
/// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_shard_zero(&self) -> bool {
pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0)
}
}
@@ -650,13 +606,7 @@ fn key_is_shard0(key: &Key) -> bool {
// relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations.
//
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
// because they must be included in basebackups.
let is_initfork = key.field5 == INIT_FORKNUM;
!is_rel_block_key(key) || is_initfork
!is_rel_block_key(key)
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name

View File

@@ -118,9 +118,7 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
// Likewise for these, although the assumption that these don't change is a little more iffy.
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
pub use v14::bindings::{PageHeaderData, XLogRecord};
pub use v14::xlog_utils::{
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
pub use v14::bindings::{CheckPoint, ControlFileData};

View File

@@ -4,9 +4,7 @@ use log::*;
use postgres::types::PgLsn;
use postgres::Client;
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use postgres_ffi::{
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
@@ -264,21 +262,11 @@ fn craft_internal<C: postgres::GenericClient>(
intermediate_lsns.insert(0, initial_lsn);
}
// Some records may not be flushed, e.g. non-transactional logical messages. Flush now.
// Some records may not be flushed, e.g. non-transactional logical messages.
//
// If the previous WAL record ended exactly at a page boundary, pg_current_wal_insert_lsn
// returns the position just after the page header on the next page. That's where the next
// record will be inserted. But the page header hasn't actually been written to the WAL
// yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
// error. Because of that, if the insert location is just after a page header, back off to
// the previous page boundary.
let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
} else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
}
client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
// because pg_current_wal_insert_lsn skips page headers.
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
Ok(intermediate_lsns)
}
@@ -332,49 +320,38 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
client.execute("CREATE table t(x int)", &[])?;
// Add padding so the XLOG_SWITCH record ends exactly on an XLOG_BLCKSZ boundary. We
// will use carefully-sized logical messages to advance the WAL insert location such
// that there is just enough space on the page for the XLOG_SWITCH record.
loop {
// We start with measuring how much WAL it takes for one logical message,
// considering all alignments and headers.
// Add padding so the XLOG_SWITCH record ends exactly on an XLOG_BLCKSZ boundary.
// We will use a logical message as the padding. We start by detecting how much WAL
// it takes for one logical message, considering all alignments and headers.
let base_wal_advance = {
let before_lsn = client.pg_current_wal_insert_lsn()?;
// A small non-empty message bigger than a few bytes is more likely than an empty
// message to have the same format as the big padding message.
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
&[],
)?;
let after_lsn = client.pg_current_wal_insert_lsn()?;
// Did the record cross a page boundary? If it did, start over. Crossing a
// page boundary adds to the apparent size of the record because of the page
// header, which throws off the calculation.
if u64::from(before_lsn) / XLOG_BLCKSZ as u64
!= u64::from(after_lsn) / XLOG_BLCKSZ as u64
{
continue;
}
// base_size is the size of a logical message without the payload
let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
// Is there enough space on the page for another logical message and an
// XLOG_SWITCH? If not, start over.
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
continue;
}
// We will write another logical message, such that after the logical message
// record, there will be space for exactly one XLOG_SWITCH. How large should
// the logical message's payload be? An XLOG_SWITCH record has no data => its
// size is exactly XLOG_SIZE_OF_XLOG_RECORD.
let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
break;
// The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
(u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
+ XLOG_SIZE_OF_XLOG_RECORD
};
let mut remaining_lsn =
XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
if remaining_lsn < base_wal_advance {
remaining_lsn += XLOG_BLCKSZ;
}
let repeats = 10 + remaining_lsn - base_wal_advance;
info!(
"current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
client.pg_current_wal_insert_lsn()?,
remaining_lsn,
base_wal_advance,
repeats
);
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,

View File

@@ -21,13 +21,11 @@ use std::{
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
str::FromStr,
sync::Arc,
time::{Duration, SystemTime},
};
use anyhow::{bail, Context};
use aws_sdk_s3::types::StorageClass;
use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
@@ -136,11 +134,6 @@ impl RemotePath {
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
}
pub fn add_trailing_slash(&self) -> Self {
// Unwrap safety: inputs are guaranteed to be valid UTF-8
Self(format!("{}/", self.0).try_into().unwrap())
}
}
/// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -164,21 +157,47 @@ pub struct Listing {
/// providing basic CRUD operations for storage files.
#[allow(async_fn_in_trait)]
pub trait RemoteStorage: Send + Sync + 'static {
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
///
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
/// from the absolute root of the bucket.
///
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
/// returned in `keys`.
///
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
/// will iteratively list objects until it runs out of keys. Note that this is not safe to use on
/// unlimited-size buckets, as the full list of objects is allocated into a monolithic data structure.
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
/// so this method doesn't need to.
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::WithDelimiter, None, cancel)
.await?
.prefixes;
Ok(result)
}
/// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtly different from list_prefixes,
/// because it is for listing files instead of listing
/// names sharing common prefixes.
/// For example,
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
///
/// max_keys limits max number of keys returned; None means unlimited.
async fn list_files(
&self,
prefix: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
.await?
.keys;
Ok(result)
}
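For orientation, here is a minimal, hedged usage sketch of the listing API documented above. The `list` signature, `ListingMode`, and the `Listing` fields are taken from this file; the storage handle, the prefix string, and the surrounding helper are hypothetical.

```rust
use std::num::NonZeroU32;
use tokio_util::sync::CancellationToken;

// Hypothetical caller; `storage` may be any implementation of the trait above.
async fn list_example(storage: &impl RemoteStorage) -> Result<(), DownloadError> {
    let cancel = CancellationToken::new();
    // Relative to any configured prefix_in_bucket (S3) or storage_root (LocalFs).
    let prefix = RemotePath::from_string("tenants/some-tenant/timelines/").unwrap();

    // No delimiter: every key under the prefix, capped at 100 entries.
    let flat = storage
        .list(
            Some(&prefix),
            ListingMode::NoDelimiter,
            NonZeroU32::new(100),
            &cancel,
        )
        .await?;
    assert!(flat.prefixes.is_empty());

    // With a delimiter: top-level "directories" arrive in `prefixes`,
    // keys directly under the prefix arrive in `keys`.
    let grouped = storage
        .list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel)
        .await?;
    println!("{} prefixes, {} keys", grouped.prefixes.len(), grouped.keys.len());
    Ok(())
}
```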
async fn list(
&self,
prefix: Option<&RemotePath>,
@@ -317,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
//
// max_keys limits max number of keys returned; None means unlimited.
pub async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
}
}
// Lists the common *prefixes*, if any, of the files
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
}
}
/// See [`RemoteStorage::upload`]
pub async fn upload(
&self,
@@ -565,7 +619,6 @@ pub struct S3Config {
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
pub concurrency_limit: NonZeroUsize,
pub max_keys_per_list_response: Option<i32>,
pub upload_storage_class: Option<StorageClass>,
}
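As a side note on the `upload_storage_class` field just above: a hedged sketch of the validation it goes through when parsed from the TOML config (the standalone helper name is hypothetical; the check mirrors the parsing shown a bit further down in this file's diff). Valid values are the storage class names known to the AWS SDK, e.g. "STANDARD" or "INTELLIGENT_TIERING"; anything else parses into `StorageClass::Unknown` and is rejected.

```rust
use std::str::FromStr;
use anyhow::bail;
use aws_sdk_s3::types::StorageClass;

// Hypothetical standalone version of the config-time check.
fn parse_upload_storage_class(s: &str) -> anyhow::Result<StorageClass> {
    let storage_class = StorageClass::from_str(s).expect("infallible");
    #[allow(deprecated)]
    if matches!(storage_class, StorageClass::Unknown(_)) {
        bail!(
            "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
            StorageClass::values()
        );
    }
    Ok(storage_class)
}

// parse_upload_storage_class("INTELLIGENT_TIERING") => Ok(StorageClass::IntelligentTiering)
// parse_upload_storage_class("NOT_A_REAL_CLASS")    => Err(..)
```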
impl Debug for S3Config {
@@ -694,18 +747,6 @@ impl RemoteStorageConfig {
endpoint,
concurrency_limit,
max_keys_per_list_response,
upload_storage_class: toml
.get("upload_storage_class")
.map(|prefix_in_bucket| -> anyhow::Result<_> {
let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
let storage_class = StorageClass::from_str(&s).expect("infallible");
#[allow(deprecated)]
if matches!(storage_class, StorageClass::Unknown(_)) {
bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
}
Ok(storage_class)
})
.transpose()?,
})
}
(_, _, _, Some(_), None) => {

View File

@@ -5,9 +5,11 @@
//! volume is mounted to the local FS.
use std::{
collections::HashSet,
borrow::Cow,
future::Future,
io::ErrorKind,
num::NonZeroU32,
pin::Pin,
time::{Duration, SystemTime, UNIX_EPOCH},
};
@@ -20,11 +22,11 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -91,47 +93,7 @@ impl LocalFs {
#[cfg(test)]
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
use std::{future::Future, pin::Pin};
fn get_all_files<'a, P>(
directory_path: P,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
tracing::debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
paths.extend(get_all_files(&entry_path).await?.into_iter())
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
Ok(get_all_files(&self.storage_root)
Ok(get_all_files(&self.storage_root, true)
.await?
.into_iter()
.map(|path| {
@@ -158,14 +120,6 @@ impl LocalFs {
// S3 object list prefixes can be arbitrary strings, but when reading
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
// If there's no trailing slash, we have to start looking from one above: even if
// `initial_dir` is a directory, we should still list any prefixes in the parent
// that start with the same string.
if !full_path.to_string().ends_with('/') {
initial_dir.pop();
}
loop {
// Did we make it to the root?
if initial_dir.parent().is_none() {
@@ -341,66 +295,61 @@ impl RemoteStorage for LocalFs {
let op = async {
let mut result = Listing::default();
// Filter out directories: in S3 directories don't exist, only the keys within them do.
let keys = self
.list_recursive(prefix)
if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
let keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let ListingMode::NoDelimiter = mode {
result.keys = keys;
} else {
let mut prefixes = HashSet::new();
for key in keys {
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
let relative_key = if let Some(prefix) = prefix {
let mut prefix = prefix.clone();
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
// end up with full file/dir names.
let prefix_full_local_path = prefix.with_base(&self.storage_root);
let has_slash = prefix.0.to_string().ends_with('/');
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
prefix
} else {
prefix.0.pop();
prefix
};
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
} else {
key
};
let relative_key = format!("{}", relative_key);
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
let first_part = relative_key
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
.next()
.unwrap()
.to_owned();
prefixes.insert(first_part);
} else {
result
.keys
.push(RemotePath::from_string(&relative_key).unwrap());
}
// Filter out empty directories to mirror S3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);
if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
result.prefixes = prefixes
.into_iter()
.map(|s| RemotePath::from_string(&s).unwrap())
.collect();
}
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
Ok(result)
};
@@ -611,6 +560,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
path_with_suffix_extension(original_path, "metadata")
}
fn get_all_files<'a, P>(
directory_path: P,
recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
if recursive {
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
} else {
paths.push(entry_path)
}
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
let target_dir = match target_file_path.parent() {
Some(parent_dir) => parent_dir,
@@ -930,18 +923,13 @@ mod fs_tests {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
let child_sibling =
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
let listing = storage
.list(None, ListingMode::NoDelimiter, None, &cancel)
.await?;
assert!(listing.prefixes.is_empty());
assert_eq!(
listing.keys.into_iter().collect::<HashSet<_>>(),
HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
);
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
// Delimiter: should only go one deep
let listing = storage
@@ -954,25 +942,7 @@ mod fs_tests {
);
assert!(listing.keys.is_empty());
// Delimiter & prefix with a trailing slash
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(
listing.keys,
[RemotePath::from_string("uncle").unwrap()].to_vec()
);
assert_eq!(
listing.prefixes,
[RemotePath::from_string("parent").unwrap()].to_vec()
);
// Delimiter and prefix without a trailing slash
// Delimiter & prefix
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -981,66 +951,12 @@ mod fs_tests {
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!(
listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec()
);
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!(
listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec()
);
Ok(())
}
#[tokio::test]
async fn list_part_component() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
// a freeform prefix.
let _child_a =
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
let _child_b =
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
let mut found_prefixes = listing.prefixes.clone();
found_prefixes.sort();
assert_eq!(
found_prefixes,
[
RemotePath::from_string("tenant").unwrap(),
RemotePath::from_string("tenant-01").unwrap(),
]
.to_vec()
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
.to_vec()
);
assert_eq!(listing.keys, [uncle.clone()].to_vec());
Ok(())
}

View File

@@ -30,7 +30,7 @@ use aws_sdk_s3::{
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
@@ -62,7 +62,6 @@ pub struct S3Bucket {
bucket_name: String,
prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>,
upload_storage_class: Option<StorageClass>,
concurrency_limiter: ConcurrencyLimiter,
// Per-request timeout. Accessible for tests.
pub timeout: Duration,
@@ -155,7 +154,6 @@ impl S3Bucket {
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
upload_storage_class: aws_config.upload_storage_class.clone(),
timeout,
})
}
@@ -180,7 +178,10 @@ impl S3Bucket {
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path.get_path().as_str();
let path_string = path
.get_path()
.as_str()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + path_string,
None => path_string.to_string(),
@@ -470,11 +471,16 @@ impl RemoteStorage for S3Bucket {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| {
self.prefix_in_bucket.clone().map(|mut s| {
s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
s
})
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
// The prefix is required to end with a separator;
// otherwise the request will return only the entry of the prefix itself
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});
let _permit = self.permit(kind, cancel).await?;
@@ -543,15 +549,11 @@ impl RemoteStorage for S3Bucket {
}
}
// S3 gives us prefixes like "foo/", we return them like "foo"
result.prefixes.extend(prefixes.iter().filter_map(|o| {
Some(
self.s3_object_to_relative_path(
o.prefix()?
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
),
)
}));
result.prefixes.extend(
prefixes
.iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
);
continuation_token = match response.next_continuation_token {
Some(new_token) => Some(new_token),
@@ -584,7 +586,6 @@ impl RemoteStorage for S3Bucket {
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.set_metadata(metadata.map(|m| m.0))
.set_storage_class(self.upload_storage_class.clone())
.content_length(from_size_bytes.try_into()?)
.body(bytes_stream)
.send();
@@ -636,7 +637,6 @@ impl RemoteStorage for S3Bucket {
.copy_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.set_storage_class(self.upload_storage_class.clone())
.copy_source(copy_source)
.send();
@@ -894,7 +894,6 @@ impl RemoteStorage for S3Bucket {
.copy_object()
.bucket(self.bucket_name.clone())
.key(key)
.set_storage_class(self.upload_storage_class.clone())
.copy_source(&source_id)
.send();
@@ -1051,22 +1050,22 @@ mod tests {
Some("/test/prefix/"),
];
let expected_outputs = [
vec!["", "some/path", "some/path/"],
vec!["/", "/some/path", "/some/path/"],
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
];
@@ -1078,7 +1077,6 @@ mod tests {
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5),
upload_storage_class: None,
};
let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");

View File

@@ -107,6 +107,27 @@ impl UnreliableWrapper {
type VoidStorage = crate::LocalFs;
impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_prefixes(prefix, cancel).await
}
async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_files(folder, max_keys, cancel).await
}
async fn list(
&self,
prefix: Option<&RemotePath>,

View File

@@ -1,6 +1,5 @@
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::ListingMode;
use remote_storage::RemotePath;
use std::sync::Arc;
use std::{collections::HashSet, num::NonZeroU32};
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list(None, ListingMode::WithDelimiter, None, &cancel)
.await?
.prefixes
.list_prefixes(None, &cancel)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
);
let nested_remote_prefixes = test_client
.list(
Some(&base_prefix.add_trailing_slash()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?
.prefixes
.list_prefixes(Some(&base_prefix), &cancel)
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then perform the following queries:
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_no_delimiter_works(
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
) -> anyhow::Result<()> {
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list(None, ListingMode::NoDelimiter, None, &cancel)
.list_files(None, None, &cancel)
.await
.context("client list root files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list on root mismatches with the uploads."
"remote storage list_files on root mismatches with the uploads."
);
// Test that max_keys limit works. In total there are about 21 files (see
// upload_simple_remote_data call in test_real_s3.rs).
let limited_root_files = test_client
.list(
None,
ListingMode::NoDelimiter,
Some(NonZeroU32::new(2).unwrap()),
&cancel,
)
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
.await
.context("client list root files failure")?;
assert_eq!(limited_root_files.keys.len(), 2);
assert_eq!(limited_root_files.len(), 2);
let nested_remote_files = test_client
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
.list_files(Some(&base_prefix), None, &cancel)
.await
.context("client list nested files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list on subdirrectory mismatches with the uploads."
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
let prefixes = ctx
.client
.list(None, ListingMode::WithDelimiter, None, &cancel)
.await?
.prefixes;
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
assert_eq!(prefixes.len(), 1);

View File

@@ -132,6 +132,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
}
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(AzureWithSimpleTestBlobs),
Disabled,

View File

@@ -12,8 +12,8 @@ use anyhow::Context;
use camino::Utf8Path;
use futures_util::StreamExt;
use remote_storage::{
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
RemoteStorageKind, S3Config,
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
S3Config,
};
use test_context::test_context;
use test_context::AsyncTestContext;
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
client: &Arc<GenericRemoteStorage>,
cancel: &CancellationToken,
) -> anyhow::Result<HashSet<RemotePath>> {
Ok(
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
.await
.context("list root files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>(),
)
Ok(retry(|| client.list_files(None, None, cancel))
.await
.context("list root files failure")?
.into_iter()
.collect::<HashSet<_>>())
}
let cancel = CancellationToken::new();
@@ -297,6 +294,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
}
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(S3WithSimpleTestBlobs),
Disabled,
@@ -380,7 +381,6 @@ fn create_s3_client(
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,
upload_storage_class: None,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};

View File

@@ -34,8 +34,6 @@ pub enum Generation {
/// scenarios where pageservers might otherwise issue conflicting writes to
/// remote storage
impl Generation {
pub const MAX: Self = Self::Valid(u32::MAX);
/// Create a new Generation that represents a legacy key format with
/// no generation suffix
pub fn none() -> Self {

View File

@@ -92,8 +92,6 @@ pub mod zstd;
pub mod env;
pub mod poison;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -1,121 +0,0 @@
//! Protect a piece of state from reuse after it is left in an inconsistent state.
//!
//! # Example
//!
//! ```
//! # tokio_test::block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!
//! struct State {
//! clean: bool,
//! }
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
//!
//! let mut mutex_guard = state.lock().await;
//! let mut poison_guard = mutex_guard.check_and_arm()?;
//! let state = poison_guard.data_mut();
//! state.clean = false;
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
//! tokio::time::sleep(Duration::from_secs(10)).await;
//! state.clean = true;
//! poison_guard.disarm();
//! # Ok::<(), utils::poison::Error>(())
//! # });
//! ```
use tracing::warn;
pub struct Poison<T> {
what: &'static str,
state: State,
data: T,
}
#[derive(Clone, Copy)]
enum State {
Clean,
Armed,
Poisoned { at: chrono::DateTime<chrono::Utc> },
}
impl<T> Poison<T> {
/// We log `what` at `warn!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
pub fn new(what: &'static str, data: T) -> Self {
Self {
what,
state: State::Clean,
data,
}
}
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
match self.state {
State::Clean => {
self.state = State::Armed;
Ok(Guard(self))
}
State::Armed => unreachable!("transient state"),
State::Poisoned { at } => Err(Error::Poisoned {
what: self.what,
at,
}),
}
}
}
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
/// Once modifications are done, use [`Self::disarm`].
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
pub struct Guard<'a, T>(&'a mut Poison<T>);
impl<'a, T> Guard<'a, T> {
pub fn data(&self) -> &T {
&self.0.data
}
pub fn data_mut(&mut self) -> &mut T {
&mut self.0.data
}
pub fn disarm(self) {
match self.0.state {
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
State::Armed => {
self.0.state = State::Clean;
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
impl<'a, T> Drop for Guard<'a, T> {
fn drop(&mut self) {
match self.0.state {
State::Clean => {
// set by disarm()
}
State::Armed => {
// still armed => poison it
let at = chrono::Utc::now();
self.0.state = State::Poisoned { at };
warn!(at=?at, "poisoning {}", self.0.what);
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("poisoned at {at}: {what}")]
Poisoned {
what: &'static str,
at: chrono::DateTime<chrono::Utc>,
},
}

View File

@@ -2,10 +2,11 @@
use std::cmp::{Eq, Ordering};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
use std::sync::Mutex;
use std::time::Duration;
use tokio::sync::watch::{self, channel};
use tokio::sync::watch::{channel, Receiver, Sender};
use tokio::time::timeout;
/// An error happened while waiting for a number
@@ -34,73 +35,23 @@ pub trait MonotonicCounter<V> {
fn cnt_value(&self) -> V;
}
/// Heap of waiters, lowest numbers pop first.
struct Waiters<V>
/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
where
S: MonotonicCounter<V>,
V: Ord,
{
heap: BinaryHeap<Waiter<V>>,
/// Number of the first waiter in the heap, or None if there are no waiters.
status_channel: watch::Sender<Option<V>>,
}
impl<V> Waiters<V>
where
V: Ord + Copy,
{
fn new() -> Self {
Waiters {
heap: BinaryHeap::new(),
status_channel: channel(None).0,
}
}
/// `status_channel` contains the number of the first waiter in the heap.
/// This function should be called whenever waiters heap changes.
fn update_status(&self) {
let first_waiter = self.heap.peek().map(|w| w.wake_num);
let _ = self.status_channel.send_replace(first_waiter);
}
/// Add new waiter to the heap, return a channel that will be notified when the number arrives.
fn add(&mut self, num: V) -> watch::Receiver<()> {
let (tx, rx) = channel(());
self.heap.push(Waiter {
wake_num: num,
wake_channel: tx,
});
self.update_status();
rx
}
/// Pop all waiters <= num from the heap. Collect channels in a vector,
/// so that caller can wake them up.
fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
let mut wake_these = Vec::new();
while let Some(n) = self.heap.peek() {
if n.wake_num > num {
break;
}
wake_these.push(self.heap.pop().unwrap().wake_channel);
}
self.update_status();
wake_these
}
/// Used on shutdown to efficiently drop all waiters.
fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
let heap = mem::take(&mut self.heap);
self.update_status();
heap
}
waiters: BinaryHeap<Waiter<V>>,
current: S,
shutdown: bool,
}
struct Waiter<T>
where
T: Ord,
{
wake_num: T, // wake me when this number arrives ...
wake_channel: watch::Sender<()>, // ... by sending a message to this channel
wake_num: T, // wake me when this number arrives ...
wake_channel: Sender<()>, // ... by sending a message to this channel
}
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -125,17 +76,6 @@ impl<T: Ord> PartialEq for Waiter<T> {
impl<T: Ord> Eq for Waiter<T> {}
/// Internal components of a `SeqWait`
struct SeqWaitInt<S, V>
where
S: MonotonicCounter<V>,
V: Ord,
{
waiters: Waiters<V>,
current: S,
shutdown: bool,
}
/// A tool for waiting on a sequence number
///
/// This provides a way to wait the arrival of a number.
@@ -168,7 +108,7 @@ where
/// Create a new `SeqWait`, initialized to a particular number
pub fn new(starting_num: S) -> Self {
let internal = SeqWaitInt {
waiters: Waiters::new(),
waiters: BinaryHeap::new(),
current: starting_num,
shutdown: false,
};
@@ -188,8 +128,9 @@ where
// Block any future waiters from starting
internal.shutdown = true;
// Take all waiters to drop them later.
internal.waiters.take_all()
// This will steal the entire waiters heap.
// When we drop it, all waiters will be woken.
mem::take(&mut internal.waiters)
// Drop the lock as we exit this scope.
};
@@ -255,7 +196,7 @@ where
/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
let mut internal = self.internal.lock().unwrap();
if internal.current.cnt_value() >= num {
return Ok(None);
@@ -264,8 +205,12 @@ where
return Err(SeqWaitError::Shutdown);
}
// Add waiter channel to the queue.
let rx = internal.waiters.add(num);
// Create a new channel.
let (tx, rx) = channel(());
internal.waiters.push(Waiter {
wake_num: num,
wake_channel: tx,
});
// Drop the lock as we exit this scope.
Ok(Some(rx))
}
@@ -286,8 +231,16 @@ where
}
internal.current.cnt_advance(num);
// Pop all waiters <= num from the heap.
internal.waiters.pop_leq(num)
// Pop all waiters <= num from the heap. Collect them in a vector, and
// wake them up after releasing the lock.
let mut wake_these = Vec::new();
while let Some(n) = internal.waiters.peek() {
if n.wake_num > num {
break;
}
wake_these.push(internal.waiters.pop().unwrap().wake_channel);
}
wake_these
};
for tx in wake_these {
@@ -302,23 +255,6 @@ where
pub fn load(&self) -> S {
self.internal.lock().unwrap().current
}
/// Get a Receiver for the current status.
///
/// The current status is the number of the first waiter in the queue,
/// or None if there are no waiters.
///
/// This receiver will be notified whenever the status changes.
/// It is useful for receiving notifications when the first waiter
/// starts waiting for a number, or when there are no more waiters left.
pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
self.internal
.lock()
.unwrap()
.waiters
.status_channel
.subscribe()
}
}
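For readers unfamiliar with the status channel described above, here is a hedged sketch of how a consumer might subscribe to it. The `utils::seqwait` import path, the `Counter` type, and the `MonotonicCounter` method bodies are assumptions for illustration; only `status_receiver` and the `watch` semantics come from this file.

```rust
use utils::seqwait::{MonotonicCounter, SeqWait};

/// Hypothetical counter type; any MonotonicCounter implementation would do.
#[derive(Clone, Copy)]
struct Counter(u64);

impl MonotonicCounter<u64> for Counter {
    fn cnt_advance(&mut self, to: u64) {
        assert!(self.0 <= to, "counter must be monotonic");
        self.0 = to;
    }
    fn cnt_value(&self) -> u64 {
        self.0
    }
}

/// Log every change of the "first waiter" status until the channel closes.
async fn log_waiter_status(seq_wait: &SeqWait<Counter, u64>) {
    let mut status = seq_wait.status_receiver();
    while status.changed().await.is_ok() {
        match *status.borrow() {
            Some(first) => tracing::info!("lowest number being waited for: {first}"),
            None => tracing::info!("no waiters"),
        }
    }
}
```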
#[cfg(test)]

View File

@@ -192,14 +192,6 @@ impl<T> OnceCell<T> {
}
}
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
/// initialized.
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let inner = self.inner.get_mut().unwrap();
inner.take_and_deinit()
}
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
pub fn initializer_count(&self) -> usize {
self.initializers.load(Ordering::Relaxed)
@@ -254,23 +246,15 @@ impl<'a, T> Guard<'a, T> {
/// The permit will be on a semaphore that is part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
self.0
.take_and_deinit()
.expect("guard is not created unless value has been initialized")
}
}
impl<T> Inner<T> {
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let value = self.value.take()?;
let mut swapped = Inner::default();
let sem = swapped.init_semaphore.clone();
// acquire and forget right away, moving the control over to InitPermit
sem.try_acquire().expect("we just created this").forget();
let permit = InitPermit(sem);
std::mem::swap(self, &mut swapped);
Some((value, permit))
std::mem::swap(&mut *self.0, &mut swapped);
swapped
.value
.map(|v| (v, InitPermit(sem)))
.expect("guard is not created unless value has been initialized")
}
}
@@ -279,13 +263,6 @@ impl<T> Inner<T> {
/// On drop, this type will return the permit.
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
impl std::fmt::Debug for InitPermit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let ptr = Arc::as_ptr(&self.0) as *const ();
f.debug_tuple("InitPermit").field(&ptr).finish()
}
}
impl Drop for InitPermit {
fn drop(&mut self) {
assert_eq!(
@@ -582,22 +559,4 @@ mod tests {
assert_eq!(*target.get().unwrap(), 11);
}
#[tokio::test]
async fn take_and_deinit_on_mut() {
use std::convert::Infallible;
let mut target = OnceCell::<u32>::default();
assert!(target.take_and_deinit().is_none());
target
.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
.await
.unwrap();
let again = target.take_and_deinit();
assert!(matches!(again, Some((42, _))), "{again:?}");
assert!(target.take_and_deinit().is_none());
}
}

View File

@@ -70,7 +70,6 @@ tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
tracing.workspace = true
twox-hash.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true

View File

@@ -27,50 +27,30 @@
//!
//! # Reference Numbers
//!
//! 2024-04-15 on i3en.3xlarge
//! 2024-04-04 on i3en.3xlarge
//!
//! ```text
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
//! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
//! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
//! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
//! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
//! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
//! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
//! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
//! ```
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use pageserver::{
config::PageServerConf,
walrecord::NeonWalRecord,
walredo::{PostgresRedoManager, ProcessKind},
};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
sync::Arc,
@@ -80,39 +60,33 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-short"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
}
@@ -120,16 +94,10 @@ criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(
process_kind: ProcessKind,
redo_work: Arc<Request>,
n_redos: u64,
nclients: u64,
) -> Duration {
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
conf.walredo_process_kind = process_kind;
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -145,40 +113,25 @@ fn bench_impl(
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager);
// divide the amount of work equally among the clients.
let nredos_per_client = n_redos / nclients;
for _ in 0..nclients {
rt.block_on(async {
tasks.spawn(client(
Arc::clone(&manager),
Arc::clone(&start),
Arc::clone(&redo_work),
nredos_per_client,
// divide the amount of work equally among the clients
n_redos / nclients,
))
});
}
let elapsed = rt.block_on(async move {
let mut total_wallclock_time = Duration::ZERO;
rt.block_on(async move {
let mut total_wallclock_time = std::time::Duration::from_millis(0);
while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap();
}
total_wallclock_time
});
// consistency check to ensure process kind setting worked
if nredos_per_client > 0 {
assert_eq!(
manager
.status()
.process
.map(|p| p.kind)
.expect("the benchmark work causes a walredo process to be spawned"),
std::borrow::Cow::Borrowed(process_kind.into())
);
}
elapsed
})
}
async fn client(

View File

@@ -243,19 +243,6 @@ impl Client {
Ok(())
}
pub async fn tenant_scan_remote_storage(
&self,
tenant_id: TenantId,
) -> Result<TenantScanRemoteStorageResponse> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/scan_remote_storage",
self.mgmt_api_endpoint
);
let response = self.request(Method::GET, &uri, ()).await?;
let body = response.json().await.map_err(Error::ReceiveBody)?;
Ok(body)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
@@ -292,7 +279,7 @@ impl Client {
lazy: bool,
) -> Result<()> {
let req_body = TenantLocationConfigRequest {
tenant_id: None,
tenant_id: Some(tenant_shard_id),
config,
};

View File

@@ -60,7 +60,7 @@ impl Client {
) -> anyhow::Result<PagestreamClient> {
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
.client
.copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
.await?;
let Client {
cancel_on_client_drop,

View File

@@ -18,7 +18,6 @@
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
@@ -126,7 +125,6 @@ async fn compact_level<E: CompactionJobExecutor>(
}
let mut state = LevelCompactionState {
shard_identity: *executor.get_shard_identity(),
target_file_size,
_lsn_range: lsn_range.clone(),
layers: layer_fragments,
@@ -166,8 +164,6 @@ struct LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
shard_identity: ShardIdentity,
// parameters
target_file_size: u64,
@@ -370,7 +366,6 @@ where
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
&self.shard_identity,
) * 8192;
let wal_size = job
@@ -435,7 +430,7 @@ where
keyspace,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
while let Some(key_range) = window.choose_next_image() {
new_jobs.push(CompactionJob::<E> {
key_range,
lsn_range: job.lsn_range.clone(),
@@ -628,12 +623,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
}
// Advance the cursor until it reaches 'target_keysize'.
fn advance_until_size(
&mut self,
w: &KeyspaceWindowHead<K>,
max_size: u64,
shard_identity: &ShardIdentity,
) {
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
while self.accum_keysize < max_size && !self.reached_end(w) {
let curr_range = &w.keyspace[self.keyspace_idx];
if self.end_key < curr_range.start {
@@ -642,7 +632,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
}
// We're now within 'curr_range'. Can we advance past it completely?
let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity);
let distance = K::key_range_size(&(self.end_key..curr_range.end));
if (self.accum_keysize + distance as u64) < max_size {
// oh yeah, it fits
self.end_key = curr_range.end;
@@ -651,7 +641,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
} else {
// advance within the range
let skip_key = self.end_key.skip_some();
let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity);
let distance = K::key_range_size(&(self.end_key..skip_key));
if (self.accum_keysize + distance as u64) < max_size {
self.end_key = skip_key;
self.accum_keysize += distance as u64;
@@ -687,7 +677,7 @@ where
}
}
fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option<Range<K>> {
fn choose_next_image(&mut self) -> Option<Range<K>> {
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
// we've reached the end
return None;
@@ -697,7 +687,6 @@ where
next_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + self.head.target_keysize,
shard_identity,
);
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
@@ -706,7 +695,6 @@ where
end_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
shard_identity,
);
if end_pos.reached_end(&self.head) {
// gobble up any unused keyspace between the last used key and end of the range

View File

@@ -5,7 +5,6 @@ use crate::interface::*;
use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pageserver_api::shard::ShardIdentity;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
@@ -14,17 +13,11 @@ use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
pub fn keyspace_total_size<K>(
keyspace: &CompactionKeySpace<K>,
shard_identity: &ShardIdentity,
) -> u64
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
where
K: CompactionKey,
{
keyspace
.iter()
.map(|r| K::key_range_size(r, shard_identity) as u64)
.sum()
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
}
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {

View File

@@ -4,7 +4,7 @@
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use futures::Future;
use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
use utils::lsn::Lsn;
@@ -32,8 +32,6 @@ pub trait CompactionJobExecutor {
// Functions that the planner uses to support its decisions
// ----
fn get_shard_identity(&self) -> &ShardIdentity;
/// Return all layers that overlap the given bounding box.
fn get_layers(
&mut self,
@@ -100,7 +98,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
///
/// This returns u32, for compatibility with Repository::key. If the
/// distance is larger, return u32::MAX.
fn key_range_size(key_range: &Range<Self>, shard_identity: &ShardIdentity) -> u32;
fn key_range_size(key_range: &Range<Self>) -> u32;
// return "self + 1"
fn next(&self) -> Self;
@@ -115,8 +113,8 @@ impl CompactionKey for Key {
const MIN: Self = Self::MIN;
const MAX: Self = Self::MAX;
fn key_range_size(r: &std::ops::Range<Self>, shard_identity: &ShardIdentity) -> u32 {
ShardedRange::new(r.clone(), shard_identity).page_count()
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
key_range_size(r)
}
fn next(&self) -> Key {
(self as &Key).next()

View File

@@ -3,7 +3,6 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use rand::Rng;
use tracing::info;
@@ -72,7 +71,7 @@ impl interface::CompactionKey for Key {
const MIN: Self = u64::MIN;
const MAX: Self = u64::MAX;
fn key_range_size(key_range: &Range<Self>, _shard_identity: &ShardIdentity) -> u32 {
fn key_range_size(key_range: &Range<Self>) -> u32 {
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
}
@@ -435,11 +434,6 @@ impl interface::CompactionJobExecutor for MockTimeline {
type ImageLayer = Arc<MockImageLayer>;
type RequestContext = MockRequestContext;
fn get_shard_identity(&self) -> &ShardIdentity {
static IDENTITY: ShardIdentity = ShardIdentity::unsharded();
&IDENTITY
}
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,

View File

@@ -9,45 +9,18 @@
//! Coordinates in both axes are compressed for better readability.
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
//!
//! The plain text API was chosen so that we can easily work with filenames from various
//! sources; see the Usage section below for examples.
//!
//! # Usage
//!
//! ## Producing the SVG
//!
//! Example use:
//! ```bash
//!
//! # local timeline dir
//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
//!
//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
//!
//! # From an `index_part.json` in S3
//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
//!
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
//! $ firefox out.svg
//! ```
//!
//! ## Viewing
//! This API was chosen so that we can easily work with filenames extracted from ssh,
//! or from pageserver log files.
//!
//! **Inkscape** is better than the built-in viewers in browsers.
//!
//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
//! to see the layer file name in the comment field.
//!
//! ```bash
//!
//! # Linux
//! inkscape out.svg
//!
//! # macOS
//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
//!
//! ```
//!
//! TODO Consider shipping this as a grafana panel plugin:
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
use anyhow::Result;
use pageserver::repository::Key;
use pageserver::METADATA_FILE_NAME;
@@ -92,12 +65,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
pub fn main() -> Result<()> {
// Parse layer filenames from stdin
struct Layer {
filename: String,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
}
let mut files: Vec<Layer> = vec![];
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin();
for line in stdin.lock().lines() {
let line = line.unwrap();
@@ -108,23 +76,14 @@ pub fn main() -> Result<()> {
// Don't try and parse "metadata" like a key-lsn range
continue;
}
let (key_range, lsn_range) = parse_filename(filename);
files.push(Layer {
filename: filename.to_owned(),
key_range,
lsn_range,
});
let range = parse_filename(filename);
ranges.push(range);
}
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for Layer {
key_range: keyr,
lsn_range: lsnr,
..
} in &files
{
for (keyr, lsnr) in &ranges {
keys.push(keyr.start);
keys.push(keyr.end);
lsns.push(lsnr.start);
@@ -148,12 +107,7 @@ pub fn main() -> Result<()> {
h: stretch * lsn_map.len() as f32
}
);
for Layer {
filename,
key_range: keyr,
lsn_range: lsnr,
} in &files
{
for (keyr, lsnr) in &ranges {
let key_start = *key_map.get(&keyr.start).unwrap();
let key_end = *key_map.get(&keyr.end).unwrap();
let key_diff = key_end - key_start;
@@ -197,7 +151,6 @@ pub fn main() -> Result<()> {
.fill(fill)
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
.border_radius(0.4)
.comment(filename)
);
}
println!("{}", EndSvg);
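A minimal sketch of the "coordinate compression" the module comment above links to: each distinct Key/Lsn coordinate is replaced by its rank among the sorted, deduplicated values, which is how `key_map`/`lsn_map` are used. The `u64` type and function names below are illustrative, not the tool's actual code.

```rust
// Hedged sketch: rank-based coordinate compression, simplified to u64.
use std::collections::BTreeMap;

fn compress(coords: &[u64]) -> BTreeMap<u64, usize> {
    let mut sorted: Vec<u64> = coords.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    // Each coordinate is replaced by its index in the sorted, deduplicated list,
    // so the SVG uses small, evenly spaced integers instead of raw Key/Lsn values.
    sorted.into_iter().enumerate().map(|(i, v)| (v, i)).collect()
}

fn demo() {
    let map = compress(&[10, 1_000_000, 10, 42]);
    assert_eq!(map[&10], 0);
    assert_eq!(map[&42], 1);
    assert_eq!(map[&1_000_000], 2);
}
```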

View File

@@ -312,12 +312,8 @@ async fn main_impl(
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}
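For orientation, a hedged sketch of how a client could populate the protocol-V2 request shape shown in this hunk. The field names come from the diff itself; the module paths and the helper function are assumptions, not pagebench's actual code.

```rust
// Hypothetical helper, not part of pagebench. request_lsn = Lsn::MAX means
// "read the latest version", while not_modified_since tells the server the page
// is known unchanged since that LSN, so it need not wait for newer WAL.
// Module paths (pageserver_api::models, pageserver_api::reltag) are assumed.
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::reltag::RelTag;
use utils::lsn::Lsn;

fn getpage_request(
    timeline_lsn: Lsn,
    read_latest: bool,
    rel: RelTag,
    blkno: u32,
) -> PagestreamGetPageRequest {
    PagestreamGetPageRequest {
        request_lsn: if read_latest { Lsn::MAX } else { timeline_lsn },
        not_modified_since: timeline_lsn,
        rel,
        blkno,
    }
}
```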

View File

@@ -1,112 +0,0 @@
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
let mut key = [0; METADATA_KEY_SIZE];
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
key[0] = AUX_KEY_PREFIX;
key[1] = dir_level1;
key[2] = dir_level2;
key[3..16].copy_from_slice(&hash[0..13]);
Key::from_metadata_key_fixed_size(&key)
}
const AUX_DIR_PG_LOGICAL: u8 = 0x01;
const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
/// Encode the aux file into a fixed-size key.
///
/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
/// We have a one-to-one mapping for each of the aux file types that we support. We hash the remaining part of the
/// path (usually a single file name, or several components) into a 13-byte hash. The 2-byte prefix is determined
/// roughly from the first two components of the path, with one unique number per component.
///
/// * pg_logical/mappings -> 0x0101
/// * pg_logical/snapshots -> 0x0102
/// * pg_logical/replorigin_checkpoint -> 0x0103
/// * pg_logical/others -> 0x01FF
/// * pg_replslot/ -> 0x0201
/// * others -> 0xFFFF
///
/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
/// The new file type must never have been written to the storage before. Otherwise, there could be data
/// corruption, as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
pub fn encode_aux_file_key(path: &str) -> Key {
if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
} else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
} else if path == "pg_logical/replorigin_checkpoint" {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
} else if let Some(fname) = path.strip_prefix("pg_logical/") {
if cfg!(debug_assertions) {
warn!(
"unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
path
);
}
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
} else if let Some(fname) = path.strip_prefix("pg_replslot/") {
aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
} else {
if cfg!(debug_assertions) {
warn!(
"unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
path
);
}
aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hash_portable() {
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
// if the algorithm produces the same hash across different environments.
assert_eq!(
305317690835051308206966631765527126151,
twox_hash::xxh3::hash128("test1".as_bytes())
);
assert_eq!(
85104974691013376326742244813280798847,
twox_hash::xxh3::hash128("test/test2".as_bytes())
);
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
}
#[test]
fn test_encoding_portable() {
// To correctly retrieve AUX files, the generated keys for the same file must be the same across all versions
// of the pageserver.
assert_eq!(
"6200000101E5B20C5F8DD5AA3289D6D9EAFA",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
);
assert_eq!(
"620000010239AAC544893139B26F501B97E6",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
);
assert_eq!(
"620000010300000000000000000000000000",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
);
assert_eq!(
"62000001FF8635AF2134B7266EC5B4189FD6",
encode_aux_file_key("pg_logical/unsupported").to_string()
);
assert_eq!(
"6200000201772D0E5D71DE14DA86142A1619",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"620000FFFF1866EBEB53B807B26A2416F317",
encode_aux_file_key("other_file_not_supported").to_string()
);
}
}
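A small, hedged usage sketch for the encoding above: two files under the same directory share the 2-byte directory prefix, so a prefix scan over the metadata keyspace can enumerate them. The file names are made up; `encode_aux_file_key` is the function defined in this module.

```rust
// Illustrative only; relies on encode_aux_file_key from this module.
fn demo_prefix_scan_keys() {
    let a = encode_aux_file_key("pg_logical/snapshots/0-40796E18.snap").to_string();
    let b = encode_aux_file_key("pg_logical/snapshots/0-40796E19.snap").to_string();
    // Both keys start with the same directory prefix (see test_encoding_portable:
    // "6200000102..."), so a range scan over that prefix returns every snapshot file.
    assert_eq!(&a[..10], &b[..10]);
}
```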

View File

@@ -10,7 +10,7 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{anyhow, Context};
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, Key};
@@ -38,14 +38,6 @@ use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;
#[derive(Debug, thiserror::Error)]
pub enum BasebackupError {
#[error("basebackup pageserver error {0:#}")]
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#}")]
Client(#[source] io::Error),
}
/// Create basebackup with non-rel data in it.
/// Only include relational data if 'full_backup' is true.
///
@@ -61,7 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>(
prev_lsn: Option<Lsn>,
full_backup: bool,
ctx: &'a RequestContext,
) -> Result<(), BasebackupError>
) -> anyhow::Result<()>
where
W: AsyncWrite + Send + Sync + Unpin,
{
@@ -100,10 +92,8 @@ where
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
return Err(BasebackupError::Server(anyhow!(
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
)));
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn);
}
provided_prev_lsn
} else {
@@ -169,26 +159,15 @@ where
}
}
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
let (kind, segno, _) = key_to_slru_block(*key)?;
match kind {
SlruKind::Clog => {
if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
return Err(BasebackupError::Server(anyhow!(
"invalid SlruKind::Clog record: block.len()={}",
block.len()
)));
}
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
}
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
if block.len() != BLCKSZ as usize {
return Err(BasebackupError::Server(anyhow!(
"invalid {:?} record: block.len()={}",
kind,
block.len()
)));
}
ensure!(block.len() == BLCKSZ as usize);
}
}
@@ -215,15 +194,12 @@ where
Ok(())
}
async fn flush(&mut self) -> Result<(), BasebackupError> {
async fn flush(&mut self) -> anyhow::Result<()> {
let nblocks = self.buf.len() / BLCKSZ as usize;
let (kind, segno) = self.current_segment.take().unwrap();
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
let header = new_tar_header(&segname, self.buf.len() as u64)?;
self.ar
.append(&header, self.buf.as_slice())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, self.buf.as_slice()).await?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -233,7 +209,7 @@ where
Ok(())
}
async fn finish(mut self) -> Result<(), BasebackupError> {
async fn finish(mut self) -> anyhow::Result<()> {
let res = if self.current_segment.is_none() || self.buf.is_empty() {
Ok(())
} else {
@@ -250,7 +226,7 @@ impl<'a, W> Basebackup<'a, W>
where
W: AsyncWrite + Send + Sync + Unpin,
{
async fn send_tarball(mut self) -> Result<(), BasebackupError> {
async fn send_tarball(mut self) -> anyhow::Result<()> {
// TODO include checksum
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -286,25 +262,16 @@ where
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.partition(
self.timeline.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
);
.await?
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let blocks = self
.timeline
.get_vectored(part, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
for (key, block) in blocks {
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
slru_builder.add_block(&key, block).await?;
slru_builder.add_block(&key, block?).await?;
}
}
slru_builder.finish().await?;
@@ -312,11 +279,8 @@ where
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self
.timeline
.list_dbdirs(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
for ((spcnode, dbnode), has_relmap_file) in
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
@@ -325,8 +289,7 @@ where
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
@@ -349,12 +312,7 @@ where
}
}
for (path, content) in self
.timeline
.list_aux_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
{
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
if path.starts_with("pg_replslot") {
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
let restart_lsn = Lsn(u64::from_le_bytes(
@@ -385,41 +343,34 @@ where
for xid in self
.timeline
.list_twophase_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
{
self.add_twophase_file(xid).await?;
}
fail_point!("basebackup-before-control-file", |_| {
Err(BasebackupError::Server(anyhow!(
"failpoint basebackup-before-control-file"
)))
bail!("failpoint basebackup-before-control-file")
});
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file().await?;
self.ar.finish().await.map_err(BasebackupError::Client)?;
self.ar.finish().await?;
debug!("all tarred up!");
Ok(())
}
/// Add contents of relfilenode `src`, naming it as `dst`.
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
.await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
let file_name = dst.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar
.append(&header, &mut io::empty())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &mut io::empty()).await?;
return Ok(());
}
@@ -433,18 +384,14 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
let file_name = dst.to_segfile_name(seg as u32);
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
self.ar
.append(&header, segment_data.as_slice())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, segment_data.as_slice()).await?;
seg += 1;
startblk = endblk;
@@ -464,22 +411,20 @@ where
spcnode: u32,
dbnode: u32,
has_relmap_file: bool,
) -> Result<(), BasebackupError> {
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
if img.len()
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
{
return Err(BasebackupError::Server(anyhow!(
"img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
img.len(),
)));
}
ensure!(
img.len()
== dispatch_pgversion!(
self.timeline.pg_version,
pgv::bindings::SIZEOF_RELMAPFILE
)
);
Some(img)
} else {
@@ -492,20 +437,14 @@ where
ver => format!("{ver}\x0A"),
};
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
info!("timeline.pg_version {}", self.timeline.pg_version);
if let Some(img) = relmap_img {
// filenode map for global tablespace
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &img[..]).await?;
} else {
warn!("global/pg_filenode.map is missing");
}
@@ -524,26 +463,18 @@ where
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.is_empty()
{
return Ok(());
}
// User defined tablespaces are not supported
if spcnode != DEFAULTTABLESPACE_OID {
return Err(BasebackupError::Server(anyhow!(
"spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
)));
}
ensure!(spcnode == DEFAULTTABLESPACE_OID);
// Append dir path for each database
let path = format!("base/{}", dbnode);
let header = new_tar_header_dir(&path)?;
self.ar
.append(&header, &mut io::empty())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &mut io::empty()).await?;
if let Some(img) = relmap_img {
let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -553,17 +484,11 @@ where
ver => format!("{ver}\x0A"),
};
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
self.ar
.append(&header, pg_version_str.as_bytes())
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, pg_version_str.as_bytes()).await?;
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
let header = new_tar_header(&relmap_path, img.len() as u64)?;
self.ar
.append(&header, &img[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &img[..]).await?;
}
};
Ok(())
@@ -572,12 +497,11 @@ where
//
// Extract twophase state files
//
async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = self
.timeline
.get_twophase_file(xid, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -585,10 +509,7 @@ where
buf.put_u32_le(crc);
let path = format!("pg_twophase/{:>08X}", xid);
let header = new_tar_header(&path, buf.len() as u64)?;
self.ar
.append(&header, &buf[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &buf[..]).await?;
Ok(())
}
@@ -597,28 +518,24 @@ where
// Add generated pg_control file and bootstrap WAL segment.
// Also send zenith.signal file with extra bootstrap data.
//
async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: invalid")?;
}
} else {
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
.map_err(|e| BasebackupError::Server(e.into()))?;
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
}
self.ar
.append(
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
zenith_signal.as_bytes(),
)
.await
.map_err(BasebackupError::Client)?;
.await?;
let checkpoint_bytes = self
.timeline
@@ -640,10 +557,7 @@ where
//send pg_control
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
self.ar
.append(&header, &pg_control_bytes[..])
.await
.map_err(BasebackupError::Client)?;
self.ar.append(&header, &pg_control_bytes[..]).await?;
//send wal segment
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -658,16 +572,8 @@ where
self.lsn,
)
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
if wal_seg.len() != WAL_SEGMENT_SIZE {
return Err(BasebackupError::Server(anyhow!(
"wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
wal_seg.len()
)));
}
self.ar
.append(&header, &wal_seg[..])
.await
.map_err(BasebackupError::Client)?;
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..]).await?;
Ok(())
}
}
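To make the intent of the error plumbing in this hunk easier to follow, here is a minimal, self-contained sketch of the Client/Server split it introduces: failures while writing to the client connection become `Client` errors, anything that went wrong producing the data becomes a `Server` error. The enum mirrors the diff, but the writer and helper below are simplified stand-ins rather than the real tar/stream code.

```rust
// Hedged sketch; not the real basebackup code.
use std::io;

#[derive(Debug, thiserror::Error)]
enum BasebackupError {
    #[error("basebackup pageserver error {0:#}")]
    Server(#[from] anyhow::Error),
    #[error("basebackup client error {0:#}")]
    Client(#[source] io::Error),
}

fn send_block(
    out: &mut impl io::Write,
    block: &[u8],
    expected_len: usize,
) -> Result<(), BasebackupError> {
    if block.len() != expected_len {
        // Data produced by the pageserver is malformed: a server-side error.
        return Err(BasebackupError::Server(anyhow::anyhow!(
            "invalid block length {}, expected {}",
            block.len(),
            expected_len
        )));
    }
    // The write can only fail because the client connection broke.
    out.write_all(block).map_err(BasebackupError::Client)
}
```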

View File

@@ -121,10 +121,8 @@ fn main() -> anyhow::Result<()> {
&[("node_id", &conf.id.to_string())],
);
// after setting up logging, log the effective IO engine choice and read path implementations
// after setting up logging, log the effective IO engine choice
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
@@ -287,7 +285,6 @@ fn start_pageserver(
))
.unwrap();
pageserver::preinitialize_metrics();
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes

View File

@@ -30,9 +30,9 @@ use utils::{
logging::LogFormat,
};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
@@ -91,16 +91,12 @@ pub mod defaults {
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
pub const DEFAULT_GET_IMPL: &str = "legacy";
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
///
/// Default built-in configuration file.
///
@@ -140,14 +136,10 @@ pub mod defaults {
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
#get_impl = '{DEFAULT_GET_IMPL}'
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -288,8 +280,6 @@ pub struct PageServerConf {
pub get_vectored_impl: GetVectoredImpl,
pub get_impl: GetImpl,
pub max_vectored_read_bytes: MaxVectoredReadBytes,
pub validate_vectored_get: bool,
@@ -300,8 +290,6 @@ pub struct PageServerConf {
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,
pub walredo_process_kind: crate::walredo::ProcessKind,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -420,15 +408,11 @@ struct PageServerConfigBuilder {
get_vectored_impl: BuilderValue<GetVectoredImpl>,
get_impl: BuilderValue<GetImpl>,
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
validate_vectored_get: BuilderValue<bool>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
}
impl PageServerConfigBuilder {
@@ -511,14 +495,11 @@ impl PageServerConfigBuilder {
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
}
}
}
@@ -690,10 +671,6 @@ impl PageServerConfigBuilder {
self.get_vectored_impl = BuilderValue::Set(value);
}
pub fn get_impl(&mut self, value: GetImpl) {
self.get_impl = BuilderValue::Set(value);
}
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
self.max_vectored_read_bytes = BuilderValue::Set(value);
}
@@ -706,10 +683,6 @@ impl PageServerConfigBuilder {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
self.walredo_process_kind = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -763,11 +736,9 @@ impl PageServerConfigBuilder {
secondary_download_concurrency,
ingest_batch_size,
get_vectored_impl,
get_impl,
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
walredo_process_kind,
}
{
@@ -1049,9 +1020,6 @@ impl PageServerConf {
"get_vectored_impl" => {
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
}
"get_impl" => {
builder.get_impl(parse_toml_from_str("get_impl", item)?)
}
"max_vectored_read_bytes" => {
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
builder.get_max_vectored_read_bytes(
@@ -1064,9 +1032,6 @@ impl PageServerConf {
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
"walredo_process_kind" => {
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1143,14 +1108,12 @@ impl PageServerConf {
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant"),
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}
}
}
@@ -1383,14 +1346,12 @@ background_task_maximum_delay = '334 s'
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
},
"Correct defaults should be used when no config values are provided"
);
@@ -1457,14 +1418,12 @@ background_task_maximum_delay = '334 s'
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
},
"Should be able to parse all basic config values correctly"
);
@@ -1577,7 +1536,6 @@ broker_endpoint = '{broker_endpoint}'
endpoint: Some(endpoint.clone()),
concurrency_limit: s3_concurrency_limit,
max_keys_per_list_response: None,
upload_storage_class: None,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
},
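A condensed, hypothetical sketch of the builder pattern this hunk edits: every pageserver option has a default constant, a setter that stores a `BuilderValue::Set`, and a fallback applied in `build()`. The names and the `String` type below are placeholders; the real `PageServerConfigBuilder` covers many more fields.

```rust
// Simplified stand-in, not the real configuration code.
enum BuilderValue<T> {
    Set(T),
    NotSet,
}

impl<T> BuilderValue<T> {
    fn unwrap_or(self, default: T) -> T {
        match self {
            BuilderValue::Set(v) => v,
            BuilderValue::NotSet => default,
        }
    }
}

struct MiniBuilder {
    get_vectored_impl: BuilderValue<String>,
}

impl MiniBuilder {
    // Called from the TOML match arm: "get_vectored_impl" => builder.get_vectored_impl(...)
    fn get_vectored_impl(&mut self, value: String) {
        self.get_vectored_impl = BuilderValue::Set(value);
    }

    fn build(self) -> String {
        // Fall back to the default constant when pageserver.toml did not set the
        // option, mirroring DEFAULT_GET_VECTORED_IMPL above.
        self.get_vectored_impl.unwrap_or("sequential".to_string())
    }
}
```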

View File

@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
continue;
}
if !tenant_shard_id.is_shard_zero() {
if !tenant_shard_id.is_zero() {
// We only send consumption metrics from shard 0, so don't waste time calculating
// synthetic size on other shards.
continue;

View File

@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
if state != TenantState::Active || !id.is_shard_zero() {
if state != TenantState::Active || !id.is_zero() {
None
} else {
tenant_manager

View File

@@ -58,6 +58,24 @@ paths:
responses:
"200":
description: The reload completed successfully.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error (also hits if no keys were found)
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}:
parameters:
@@ -75,14 +93,62 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/TenantInfo"
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete:
description: |
Attempts to delete the specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
404 means that deletion finished successfully.
responses:
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Tenant not found. This is the success path.
description: Tenant not found
content:
application/json:
schema:
@@ -99,6 +165,18 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/time_travel_remote_storage:
parameters:
@@ -128,6 +206,36 @@ paths:
application/json:
schema:
type: string
"400":
description: Error when no tenant id found in path or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline:
parameters:
@@ -147,6 +255,36 @@ paths:
type: array
items:
$ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
@@ -171,12 +309,60 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete:
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
responses:
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Timeline not found. This is the success path.
description: Timeline not found
content:
application/json:
schema:
@@ -193,6 +379,18 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
parameters:
@@ -225,6 +423,36 @@ paths:
schema:
type: string
format: date-time
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Timeline not found, or there is no timestamp information for the given lsn
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters:
@@ -256,6 +484,36 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/LsnByTimestampResponse"
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
parameters:
@@ -279,6 +537,36 @@ paths:
application/json:
schema:
type: string
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_shard_id}/location_config:
parameters:
- name: tenant_shard_id
@@ -340,6 +628,24 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/TenantLocationConfigResponse"
"503":
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: |
The tenant is already known to Pageserver in some way,
@@ -356,6 +662,12 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/ignore:
parameters:
- name: tenant_id
@@ -372,6 +684,36 @@ paths:
responses:
"200":
description: Tenant ignored
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/load:
@@ -398,6 +740,36 @@ paths:
responses:
"202":
description: Tenant scheduled to load successfully
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
parameters:
@@ -418,6 +790,37 @@ paths:
responses:
"202":
description: Tenant scheduled to load successfully
"404":
description: No tenant or timeline found for the specified ids
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
@@ -436,8 +839,31 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
# This route has no handler. TODO: remove?
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
@@ -519,6 +945,18 @@ paths:
responses:
"200":
description: Success
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_shard_id}/secondary/download:
parameters:
@@ -549,6 +987,20 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/SecondaryProgress"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/:
parameters:
@@ -591,6 +1043,24 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/TimelineInfo"
"400":
description: Malformed timeline create request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"406":
description: Permanently unsatisfiable request, don't retry.
content:
@@ -609,6 +1079,18 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/:
get:
@@ -622,6 +1104,30 @@ paths:
type: array
items:
$ref: "#/components/schemas/TenantInfo"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
post:
description: |
@@ -642,12 +1148,43 @@ paths:
application/json:
schema:
type: string
"400":
description: Malformed tenant create request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: Tenant already exists, creation skipped
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/config:
put:
@@ -669,6 +1206,36 @@ paths:
type: array
items:
$ref: "#/components/schemas/TenantInfo"
"400":
description: Malformed tenant config request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/config/:
parameters:
@@ -688,6 +1255,42 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/TenantConfigResponse"
"400":
description: Malformed get tenant config request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Tenant or timeline not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/utilization:
get:
@@ -701,6 +1304,12 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/PageserverUtilization"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
components:
securitySchemes:

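Purely illustrative and not part of the spec: one way a client could map the response codes documented above onto retry behaviour, following the descriptions ("Temporarily unavailable, please retry.", and deletion being retried until a 404 is observed).

```rust
// Hypothetical client-side helper; status codes follow the spec text above.
fn should_retry(status: u16, is_deletion: bool) -> bool {
    match status {
        // "Temporarily unavailable, please retry." and generic operation errors.
        500 | 503 => true,
        // Deletion is complete only once the resource is gone (404), so 409 is retried.
        409 if is_deletion => true,
        // 400/401/403/404/406/412 indicate a malformed, unauthorized, or
        // permanently unsatisfiable request; retrying the same call will not help.
        _ => false,
    }
}
```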
View File

@@ -19,8 +19,6 @@ use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
use pageserver_api::models::TenantScanRemoteStorageResponse;
use pageserver_api::models::TenantScanRemoteStorageShard;
use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
@@ -31,7 +29,6 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use tenant_size_model::{SizeResult, StorageModel};
@@ -57,9 +54,6 @@ use crate::tenant::mgr::{
};
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
use crate::tenant::remote_timeline_client;
use crate::tenant::remote_timeline_client::download_index_part;
use crate::tenant::remote_timeline_client::list_remote_tenant_shards;
use crate::tenant::remote_timeline_client::list_remote_timelines;
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -166,9 +160,6 @@ impl From<PageReconstructError> for ApiError {
fn from(pre: PageReconstructError) -> ApiError {
match pre {
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
PageReconstructError::MissingKey(e) => {
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
}
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
@@ -466,12 +457,8 @@ async fn reload_auth_validation_keys_handler(
json_response(StatusCode::OK, ())
}
Err(e) => {
let err_msg = "Error reloading public keys";
warn!("Error reloading public keys from {key_path:?}: {e:}");
json_response(
StatusCode::INTERNAL_SERVER_ERROR,
HttpErrorBody::from_msg(err_msg.to_string()),
)
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
}
}
}
@@ -709,7 +696,7 @@ async fn get_lsn_by_timestamp_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() {
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
@@ -760,7 +747,7 @@ async fn get_timestamp_of_lsn_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() {
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
@@ -785,9 +772,7 @@ async fn get_timestamp_of_lsn_handler(
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
json_response(StatusCode::OK, time)
}
None => Err(ApiError::NotFound(
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
)),
None => json_response(StatusCode::NOT_FOUND, ()),
}
}
@@ -1101,7 +1086,7 @@ async fn tenant_size_handler(
let headers = request.headers();
let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() {
if !tenant_shard_id.is_zero() {
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
)));
@@ -1918,14 +1903,12 @@ async fn timeline_collect_keyspace(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let (dense_ks, sparse_ks) = timeline
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
// This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
// Therefore, we split dense/sparse keys in this API.
let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };
let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };
json_response(StatusCode::OK, res)
}
@@ -2043,79 +2026,6 @@ async fn secondary_upload_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_scan_remote_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let Some(remote_storage) = state.remote_storage.as_ref() else {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Remote storage not configured"
)));
};
let mut response = TenantScanRemoteStorageResponse::default();
let (shards, _other_keys) =
list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
for tenant_shard_id in shards {
let (timeline_ids, _other_keys) =
list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
let mut generation = Generation::none();
for timeline_id in timeline_ids {
match download_index_part(
remote_storage,
&tenant_shard_id,
&timeline_id,
Generation::MAX,
&cancel,
)
.instrument(info_span!("download_index_part",
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
%timeline_id))
.await
{
Ok((index_part, index_generation)) => {
tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
generation = std::cmp::max(generation, index_generation);
}
Err(DownloadError::NotFound) => {
// This is normal for tenants that were created with multiple shards: they have an unsharded path
// containing the timeline's initdb tarball but no index. Otherwise it is a bit strange.
tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping");
continue;
}
Err(e) => {
return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
}
};
}
response.shards.push(TenantScanRemoteStorageShard {
tenant_shard_id,
generation: generation.into(),
});
}
if response.shards.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(),
));
}
json_response(StatusCode::OK, response)
}
async fn secondary_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -2512,9 +2422,6 @@ pub fn make_router(
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})
.get("/v1/tenant/:tenant_id/scan_remote_storage", |r| {
api_handler(r, tenant_scan_remote_handler)
})
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
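A hedged, self-contained sketch of the 404 handling difference shown in `get_timestamp_of_lsn_handler` above: a structured error carrying a message versus an empty NOT_FOUND body. `ApiError` here is a simplified stand-in, not the pageserver's real type.

```rust
// Illustrative only.
#[derive(Debug)]
enum ApiError {
    NotFound(String),
}

fn lookup_timestamp(found: Option<u64>, lsn: u64) -> Result<u64, ApiError> {
    match found {
        Some(time) => Ok(time),
        // Carrying the LSN in the error body makes the 404 self-describing for
        // callers, instead of an empty response.
        None => Err(ApiError::NotFound(format!(
            "Timestamp for lsn {} not found",
            lsn
        ))),
    }
}
```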

View File

@@ -12,7 +12,6 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub use pageserver_api::keyspace;
pub mod aux_file;
pub mod metrics;
pub mod page_cache;
pub mod page_service;

View File

@@ -51,9 +51,6 @@ pub(crate) enum StorageTimeOperation {
#[strum(serialize = "gc")]
Gc,
#[strum(serialize = "update gc info")]
UpdateGcInfo,
#[strum(serialize = "create tenant")]
CreateTenant,
}
@@ -89,58 +86,41 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_read_global",
"Number of layers visited to reconstruct one key",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)
.expect("failed to define a metric")
});
pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_vectored_read_global",
"Average number of layers visited to reconstruct one key",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
"pageserver_read_num_fs_layers",
"Number of persistent layers accessed for processing a read request, including those in the cache",
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
)
.expect("failed to define a metric")
});
// Metrics collected on operations on the storage repository.
#[derive(
Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
)]
pub(crate) enum GetKind {
Singular,
Vectored,
}
pub(crate) struct ReconstructTimeMetrics {
singular: Histogram,
vectored: Histogram,
ok: Histogram,
err: Histogram,
}
pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
&["get_kind"],
&["result"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
ReconstructTimeMetrics {
singular: inner.with_label_values(&[GetKind::Singular.into()]),
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
err: inner.get_metric_with_label_values(&["err"]).unwrap(),
}
});
impl ReconstructTimeMetrics {
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
match get_kind {
GetKind::Singular => &self.singular,
GetKind::Vectored => &self.vectored,
pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
match result {
Ok(_) => &self.ok,
Err(_) => &self.err,
}
}
}
@@ -153,33 +133,13 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) struct ReconstructDataTimeMetrics {
singular: Histogram,
vectored: Histogram,
}
impl ReconstructDataTimeMetrics {
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
match get_kind {
GetKind::Singular => &self.singular,
GetKind::Vectored => &self.vectored,
}
}
}
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
let inner = register_histogram_vec!(
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_get_reconstruct_data_seconds",
"Time spent in get_reconstruct_value_data",
&["get_kind"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
ReconstructDataTimeMetrics {
singular: inner.with_label_values(&[GetKind::Singular.into()]),
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
}
.expect("failed to define a metric")
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
@@ -1522,6 +1482,35 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
}
});
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
});
pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) upload_heatmap_errors: IntCounter,
@@ -1529,8 +1518,7 @@ pub(crate) struct SecondaryModeMetrics {
pub(crate) download_heatmap: IntCounter,
pub(crate) download_layer: IntCounter,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
SecondaryModeMetrics {
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants"
@@ -1548,7 +1536,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
.expect("failed to define a metric"),
download_heatmap: register_int_counter!(
"pageserver_secondary_download_heatmap",
"Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
"Number of downloads of heatmaps by secondary mode locations"
)
.expect("failed to define a metric"),
download_layer: register_int_counter!(
@@ -1556,7 +1544,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
"Number of downloads of layers by secondary mode locations"
)
.expect("failed to define a metric"),
}
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1723,43 +1710,6 @@ macro_rules! redo_bytes_histogram_count_buckets {
};
}
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) time_spent_on_ingest: Histogram,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
time_spent_on_ingest: register_histogram!(
"pageserver_wal_ingest_put_value_seconds",
"Actual time spent on ingesting a record",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
@@ -1869,29 +1819,6 @@ impl Default for WalRedoProcessCounters {
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default);
#[cfg(not(test))]
pub mod wal_redo {
use super::*;
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
std::sync::Mutex::new(
register_uint_gauge_vec!(
"pageserver_wal_redo_process_kind",
"The configured process kind for walredo",
&["kind"],
)
.unwrap(),
)
});
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
// use guard to avoid races around the next two steps
let guard = PROCESS_KIND.lock().unwrap();
guard.reset();
guard.with_label_values(&[&format!("{kind}")]).set(1);
}
}
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub(crate) struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
@@ -1913,22 +1840,6 @@ impl StorageTimeMetricsTimer {
self.metrics.timeline_count.inc();
self.metrics.global_histogram.observe(duration);
}
/// Turns this timer into one that always records -- usually this means recording
/// regardless of whether an early `?` path was taken in a function.
pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer {
AlwaysRecordingStorageTimeMetricsTimer(Some(self))
}
}
pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
fn drop(&mut self) {
if let Some(inner) = self.0.take() {
inner.stop_and_record();
}
}
}
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
@@ -1989,7 +1900,6 @@ pub(crate) struct TimelineMetrics {
pub imitate_logical_size_histo: StorageTimeMetrics,
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub update_gc_info_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2050,12 +1960,6 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let update_gc_info_histo = StorageTimeMetrics::new(
StorageTimeOperation::UpdateGcInfo,
&tenant_id,
&shard_id,
&timeline_id,
);
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2098,7 +2002,6 @@ impl TimelineMetrics {
logical_size_histo,
imitate_logical_size_histo,
garbage_collect_histo,
update_gc_info_histo,
load_layer_map_histo,
last_record_gauge,
resident_physical_size_gauge,
@@ -2186,7 +2089,7 @@ impl TimelineMetrics {
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
// Only shard zero deals in synthetic sizes
if tenant_shard_id.is_shard_zero() {
if tenant_shard_id.is_zero() {
let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
@@ -2843,8 +2746,7 @@ pub fn preinitialize_metrics() {
// histograms
[
&READ_NUM_LAYERS_VISITED,
&VEC_READ_NUM_LAYERS_VISITED,
&READ_NUM_FS_LAYERS,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,

View File

@@ -1,5 +1,13 @@
//
//! The Page Service listens for client connections and serves their GetPage@LSN
//! requests.
//
// It is possible to connect here using the usual psql/pgbench/libpq. The following
// commands are currently supported:
// *status* -- show current info about this pageserver,
// *pagestream* -- enter the mode where smgr and pageserver talk with their
// custom protocol.
//
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
@@ -15,7 +23,7 @@ use pageserver_api::models::{
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
PagestreamNblocksResponse, PagestreamProtocolVersion,
PagestreamNblocksResponse,
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
@@ -48,7 +56,6 @@ use utils::{
use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
@@ -544,7 +551,6 @@ impl PageServerHandler {
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
protocol_version: PagestreamProtocolVersion,
ctx: RequestContext,
) -> Result<(), QueryError>
where
@@ -607,15 +613,14 @@ impl PageServerHandler {
t.trace(&copy_data_bytes)
}
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
// TODO: We could create a new per-request context here, with unique ID.
// Currently we use the same per-timeline context for all requests
let (response, span) = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -624,7 +629,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::Nblocks(req) => {
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
(
self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -634,7 +639,7 @@ impl PageServerHandler {
}
PagestreamFeMessage::GetPage(req) => {
// shard_id is filled in by the handler
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
(
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -643,7 +648,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::DbSize(req) => {
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
(
self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -652,7 +657,7 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetSlruSegment(req) => {
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
(
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
@@ -833,80 +838,78 @@ impl PageServerHandler {
/// Helper function to handle the LSN from client request.
///
/// Each GetPage (and Exists and Nblocks) request includes information about
/// which version of the page is being requested. The primary compute node
/// will always request the latest page version, by setting 'request_lsn' to
/// the last inserted or flushed WAL position, while a standby will request
/// a version at the LSN that it's currently caught up to.
/// which version of the page is being requested. The client can request the
/// latest version of the page, or the version that's valid at a particular
/// LSN. The primary compute node will always request the latest page
/// version, while a standby will request a version at the LSN that it's
/// currently caught up to.
///
/// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions.
///
/// In addition to the request LSN, each request carries another LSN,
/// 'not_modified_since', which is a hint to the pageserver that the client
/// knows that the page has not been modified between 'not_modified_since'
/// and the request LSN. This allows skipping the wait, as long as the WAL
/// up to 'not_modified_since' has arrived. If the client doesn't have any
/// information about when the page was modified, it will use
/// not_modified_since == request_lsn. If the client lies and sends too low a
/// not_modified_since such that there are in fact later page versions, the
/// behavior is undefined: the pageserver may return any of the page versions
/// or an error.
async fn wait_or_get_last_lsn(
timeline: &Timeline,
request_lsn: Lsn,
not_modified_since: Lsn,
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
ctx: &RequestContext,
) -> Result<Lsn, PageStreamError> {
let last_record_lsn = timeline.get_last_record_lsn();
if latest {
// Latest page version was requested. If LSN is given, it is a hint
// to the page server that there have been no modifications to the
// page after that LSN. If we haven't received WAL up to that point,
// wait until it arrives.
let last_record_lsn = timeline.get_last_record_lsn();
// Sanity check the request
if request_lsn < not_modified_since {
return Err(PageStreamError::BadRequest(
format!(
"invalid request with request LSN {} and not_modified_since {}",
request_lsn, not_modified_since,
)
.into(),
));
}
if request_lsn < **latest_gc_cutoff_lsn {
// Check explicitly for INVALID just to get a less scary error message if the
// request is obviously bogus
return Err(if request_lsn == Lsn::INVALID {
PageStreamError::BadRequest("invalid LSN(0) in request".into())
// Note: this covers the special case that lsn == Lsn(0). That
// special case means "return the latest version whatever it is",
// and it's used for bootstrapping purposes, when the page server is
// connected directly to the compute node. That is needed because
// when you connect to the compute node, to receive the WAL, the
// walsender process will do a look up in the pg_authid catalog
// table for authentication. That poses a deadlock problem: the
// catalog table lookup will send a GetPage request, but the GetPage
// request will block in the page server because the recent WAL
// hasn't been received yet, and it cannot be received until the
// walsender completes the authentication and starts streaming the
// WAL.
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
request_lsn, **latest_gc_cutoff_lsn
).into())
});
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
if not_modified_since > last_record_lsn {
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
// anyway)
}
} else {
if lsn == Lsn(0) {
return Err(PageStreamError::BadRequest(
"invalid LSN(0) in request".into(),
));
}
timeline
.wait_lsn(
not_modified_since,
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
// Since we waited for 'not_modified_since' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
Ok(not_modified_since)
} else {
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
// here instead. That would give the same result, since we know that there
// haven't been any modifications since 'not_modified_since'. Using an older
// LSN might be faster, because that could allow skipping recent layers when
// finding the page. However, we have historically used 'last_record_lsn', so
// stick to that for now.
Ok(std::cmp::min(last_record_lsn, request_lsn))
}
if lsn < **latest_gc_cutoff_lsn {
return Err(PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
).into()));
}
Ok(lsn)
}
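To summarize the rule described in the comment above, here is a minimal standalone sketch of the effective-LSN selection. The function name and the plain u64/Result types are illustrative only; the real code additionally waits for WAL and checks the GC cutoff, as shown above.

fn effective_lsn(
    request_lsn: u64,
    not_modified_since: u64,
    last_record_lsn: u64,
) -> Result<u64, String> {
    // A request may never claim "not modified since" a point later than the request LSN.
    if request_lsn < not_modified_since {
        return Err(format!(
            "invalid request with request LSN {request_lsn} and not_modified_since {not_modified_since}"
        ));
    }
    if not_modified_since > last_record_lsn {
        // The real code waits here for WAL up to 'not_modified_since' to arrive.
        Ok(not_modified_since)
    } else {
        // No wait needed: any LSN in [not_modified_since, request_lsn] yields the same page,
        // and the code historically uses the last-record LSN capped by the request LSN.
        Ok(std::cmp::min(last_record_lsn, request_lsn))
    }
}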
#[instrument(skip_all, fields(shard_id))]
@@ -923,17 +926,12 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let exists = timeline
.get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -956,17 +954,12 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let n_blocks = timeline
.get_rel_size(req.rel, Version::Lsn(lsn), ctx)
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -989,17 +982,18 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let total_blocks = timeline
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
.get_db_size(
DEFAULTTABLESPACE_OID,
req.dbnode,
Version::Lsn(lsn),
req.latest,
ctx,
)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -1166,17 +1160,12 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -1199,14 +1188,9 @@ impl PageServerHandler {
.start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
timeline,
req.request_lsn,
req.not_modified_since,
&latest_gc_cutoff_lsn,
ctx,
)
.await?;
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let kind = SlruKind::from_repr(req.kind)
.ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
@@ -1217,10 +1201,6 @@ impl PageServerHandler {
))
}
/// Note on "fullbackup":
/// Full basebackups should only be used for debugging purposes.
/// Originally, it was introduced to enable breaking storage format changes,
/// but that is not applicable anymore.
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
async fn handle_basebackup_request<IO>(
@@ -1237,13 +1217,6 @@ impl PageServerHandler {
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
fn map_basebackup_error(err: BasebackupError) -> QueryError {
match err {
BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
}
}
let started = std::time::Instant::now();
// check that the timeline exists
@@ -1269,8 +1242,7 @@ impl PageServerHandler {
let lsn_awaited_after = started.elapsed();
// switch client to COPYOUT
pgb.write_message_noflush(&BeMessage::CopyOutResponse)
.map_err(QueryError::Disconnected)?;
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
// Send a tarball of the latest layer on the timeline. Compress if not
@@ -1285,8 +1257,7 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
} else {
let mut writer = pgb.copyout_writer();
if gzip {
@@ -1307,13 +1278,9 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
// shutdown the encoder to ensure the gzip footer is written
encoder
.shutdown()
.await
.map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
encoder.shutdown().await?;
} else {
basebackup::send_basebackup_tarball(
&mut writer,
@@ -1323,13 +1290,11 @@ impl PageServerHandler {
full_backup,
ctx,
)
.await
.map_err(map_basebackup_error)?;
.await?;
}
}
pgb.write_message_noflush(&BeMessage::CopyDone)
.map_err(QueryError::Disconnected)?;
pgb.write_message_noflush(&BeMessage::CopyDone)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let basebackup_after = started
@@ -1439,34 +1404,7 @@ where
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
if query_string.starts_with("pagestream_v2 ") {
let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
} else if query_string.starts_with("pagestream ") {
if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();
if params.len() != 2 {
@@ -1485,14 +1423,8 @@ where
self.check_permission(Some(tenant_id))?;
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V1,
ctx,
)
.await?;
self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
.await?;
} else if query_string.starts_with("basebackup ") {
let (_, params_raw) = query_string.split_at("basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();

View File

@@ -9,7 +9,6 @@
use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::WAL_INGEST;
use crate::repository::*;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
@@ -23,7 +22,6 @@ use pageserver_api::key::{
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -177,6 +175,7 @@ impl Timeline {
tag: RelTag,
blknum: BlockNumber,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
if tag.relnode == 0 {
@@ -185,7 +184,7 @@ impl Timeline {
));
}
let nblocks = self.get_rel_size(tag, version, ctx).await?;
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
@@ -207,6 +206,7 @@ impl Timeline {
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<usize, PageReconstructError> {
let mut total_blocks = 0;
@@ -214,7 +214,7 @@ impl Timeline {
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
for rel in rels {
let n_blocks = self.get_rel_size(rel, version, ctx).await?;
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
@@ -225,6 +225,7 @@ impl Timeline {
&self,
tag: RelTag,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
if tag.relnode == 0 {
@@ -238,7 +239,7 @@ impl Timeline {
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, version, ctx).await?
&& !self.get_rel_exists(tag, version, latest, ctx).await?
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
@@ -251,8 +252,16 @@ impl Timeline {
let mut buf = version.get(self, key, ctx).await?;
let nblocks = buf.get_u32_le();
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
if latest {
// Update the relation size cache only if the "latest" flag is set.
// This flag is set by compute when it is working with the most recent version of the relation.
// Typically the primary compute node always sets latest=true.
// Note that even if a compute node "by mistake" specifies an old LSN but sets
// latest=true, it cannot cause cache corruption, because with latest=true the
// pageserver chooses max(request_lsn, last_written_lsn), so the cached value will be
// associated with the most recent LSN.
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
}
Ok(nblocks)
}
@@ -261,6 +270,7 @@ impl Timeline {
&self,
tag: RelTag,
version: Version<'_>,
_latest: bool,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
if tag.relnode == 0 {
@@ -456,12 +466,6 @@ impl Timeline {
// Didn't find any commit timestamps smaller than the request
Ok(LsnForTimestamp::Past(min_lsn))
}
(true, _) if commit_lsn < min_lsn => {
// the search above did set found_smaller to true but it never increased the lsn.
// Then, low is still the old min_lsn, and the subtraction above gave a value
// below the min_lsn. We should never do that.
Ok(LsnForTimestamp::Past(min_lsn))
}
(true, false) => {
// Only found commits with timestamps smaller than the request.
// It's still a valid case for branch creation, return it.
@@ -731,13 +735,11 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed may be removed from the underlying storage (from
/// that LSN forwards).
///
/// The return value is (dense keyspace, sparse keyspace).
pub(crate) async fn collect_keyspace(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
) -> Result<KeySpace, CollectKeySpaceError> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -809,18 +811,13 @@ impl Timeline {
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
result.add_key(AUX_FILES_KEY);
}
Ok((
result.to_keyspace(),
/* AUX sparse key space */
SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
))
Ok(result.to_keyspace())
}
/// Get cached size of relation if it not updated after specified LSN
pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
if lsn >= *cached_lsn {
return Some(*nblocks);
}
@@ -831,16 +828,7 @@ impl Timeline {
/// Update cached relation size if there is no more recent update
pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if lsn < rel_size_cache.complete_as_of {
// Do not cache old values. It's safe to cache the size on read, as long as
// the read was at an LSN since we started the WAL ingestion. Reasoning: we
// never evict values from the cache, so if the relation size changed after
// 'lsn', the new value is already in the cache.
return;
}
match rel_size_cache.map.entry(tag) {
match rel_size_cache.entry(tag) {
hash_map::Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
@@ -856,13 +844,13 @@ impl Timeline {
/// Store cached relation size
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.map.insert(tag, (lsn, nblocks));
rel_size_cache.insert(tag, (lsn, nblocks));
}
/// Remove cached relation size
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.map.remove(tag);
rel_size_cache.remove(tag);
}
}
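As a side note, the cache-update rule used by update_cached_rel_size above boils down to an LSN comparison. A minimal sketch with a plain HashMap standing in for the real cache type (key and value types simplified, locking omitted):

use std::collections::HashMap;

// Sketch: an entry is only overwritten when the new observation is at an LSN
// at least as recent as the cached one, so stale reads never clobber newer sizes.
// (u32 stands in for RelTag, u64 for Lsn, u32 for BlockNumber.)
fn update_cached_size(cache: &mut HashMap<u32, (u64, u32)>, rel: u32, lsn: u64, nblocks: u32) {
    match cache.get(&rel) {
        Some((cached_lsn, _)) if lsn < *cached_lsn => {
            // keep the more recent cached value
        }
        _ => {
            cache.insert(rel, (lsn, nblocks));
        }
    }
}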
@@ -1100,7 +1088,7 @@ impl<'a> DatadirModification<'a> {
) -> anyhow::Result<()> {
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
.await?;
// Remove entry from dbdir
@@ -1199,7 +1187,7 @@ impl<'a> DatadirModification<'a> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
if self
.tline
.get_rel_exists(rel, Version::Modified(self), ctx)
.get_rel_exists(rel, Version::Modified(self), true, ctx)
.await?
{
let size_key = rel_size_to_key(rel);
@@ -1413,7 +1401,7 @@ impl<'a> DatadirModification<'a> {
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
// We already updated aux files in `self`: emit a delta and update our latest value
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
@@ -1458,14 +1446,10 @@ impl<'a> DatadirModification<'a> {
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey { .. },
) => {
// FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
// we are assuming that all _other_ possible errors represent a missing key. If some
// other error occurs, we may incorrectly reset the map of aux files.
Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.
let mut dir = AuxFilesDirectory {
@@ -1557,8 +1541,6 @@ impl<'a> DatadirModification<'a> {
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let mut writer = self.tline.writer().await;
let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
@@ -1598,8 +1580,6 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
timer.observe_duration();
Ok(())
}

View File

@@ -33,52 +33,6 @@ impl Value {
}
}
#[cfg(test)]
#[derive(Debug, PartialEq)]
pub(crate) enum InvalidInput {
TooShortValue,
TooShortPostgresRecord,
}
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, let's
/// use this type for querying if a slice looks some particular way.
#[cfg(test)]
pub(crate) struct ValueBytes;
#[cfg(test)]
impl ValueBytes {
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {
return Err(InvalidInput::TooShortValue);
}
let value_discriminator = &raw[0..4];
if value_discriminator == [0, 0, 0, 0] {
// Value::Image always initializes
return Ok(true);
}
if value_discriminator != [0, 0, 0, 1] {
// not a Value::WalRecord(..)
return Ok(false);
}
let walrecord_discriminator = &raw[4..8];
if walrecord_discriminator != [0, 0, 0, 0] {
// only NeonWalRecord::Postgres can have will_init
return Ok(false);
}
if raw.len() < 17 {
return Err(InvalidInput::TooShortPostgresRecord);
}
Ok(raw[8] == 1)
}
}
#[cfg(test)]
mod test {
use super::*;
@@ -116,8 +70,6 @@ mod test {
];
roundtrip!(image, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
}
#[test]
@@ -141,96 +93,6 @@ mod test {
];
roundtrip!(rec, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
}
#[test]
fn bytes_inspection_too_short_image() {
let rec = Value::Image(Bytes::from_static(b""));
#[rustfmt::skip]
let expected = [
// top level discriminator of 4 bytes
0x00, 0x00, 0x00, 0x00,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
roundtrip!(rec, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
assert_eq!(expected.len(), 12);
for len in 0..12 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortValue
);
}
}
#[test]
fn bytes_inspection_too_short_postgres_record() {
let rec = NeonWalRecord::Postgres {
will_init: false,
rec: Bytes::from_static(b""),
};
let rec = Value::WalRecord(rec);
#[rustfmt::skip]
let expected = [
// flattened discriminator of total 8 bytes
0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x00,
// will_init
0x00,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
roundtrip!(rec, expected);
assert!(!ValueBytes::will_init(&expected).unwrap());
assert_eq!(expected.len(), 17);
for len in 12..17 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortPostgresRecord
)
}
for len in 0..12 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortValue
)
}
}
#[test]
fn clear_visibility_map_flags_example() {
let rec = NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: Some(0x11),
old_heap_blkno: None,
flags: 0x03,
};
let rec = Value::WalRecord(rec);
#[rustfmt::skip]
let expected = [
// discriminators
0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x01,
// Some == 1 followed by 4 bytes
0x01, 0x00, 0x00, 0x00, 0x11,
// None == 0
0x00,
// flags
0x03
];
roundtrip!(rec, expected);
assert!(!ValueBytes::will_init(&expected).unwrap());
}
}

View File

@@ -361,8 +361,6 @@ pub enum TaskKind {
DebugTool,
EphemeralFilePreWarmPageCache,
#[cfg(test)]
UnitTest,
}

View File

@@ -386,7 +386,7 @@ impl WalRedoManager {
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
match self {
WalRedoManager::Prod(m) => Some(m.status()),
WalRedoManager::Prod(m) => m.status(),
#[cfg(test)]
WalRedoManager::Test(_) => None,
}
@@ -559,10 +559,9 @@ impl Tenant {
// By doing what we do here, the index part upload is retried.
// If control plane retries timeline creation in the meantime, the mgmt API handler
// for timeline creation will coalesce on the upload we queue here.
// FIXME: this branch should be dead code as we no longer write local metadata.
let rtc = timeline.remote_client.as_ref().unwrap();
rtc.init_upload_queue_for_empty_remote(&metadata)?;
rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
rtc.schedule_index_upload_for_metadata_update(&metadata)?;
}
timeline
@@ -888,7 +887,7 @@ impl Tenant {
#[instrument(skip_all)]
pub(crate) async fn preload(
self: &Arc<Self>,
self: &Arc<Tenant>,
remote_storage: &GenericRemoteStorage,
cancel: CancellationToken,
) -> anyhow::Result<TenantPreload> {
@@ -918,13 +917,9 @@ impl Tenant {
Ok(TenantPreload {
deleting,
timelines: Self::load_timeline_metadata(
self,
remote_timeline_ids,
remote_storage,
cancel,
)
.await?,
timelines: self
.load_timeline_metadata(remote_timeline_ids, remote_storage, cancel)
.await?,
})
}
@@ -2874,23 +2869,20 @@ impl Tenant {
}
}
let cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
.unwrap_or(Lsn(0));
if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();
timeline
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
.await?;
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();
timeline
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
.await?;
gc_timelines.push(timeline);
gc_timelines.push(timeline);
}
}
drop(gc_cs);
Ok(gc_timelines)
@@ -3035,7 +3027,7 @@ impl Tenant {
// See also https://github.com/neondatabase/neon/issues/3865
if let Some(remote_client) = new_timeline.remote_client.as_ref() {
remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)
.schedule_index_upload_for_metadata_update(&metadata)
.context("branch initial metadata upload")?;
}
@@ -3198,7 +3190,7 @@ impl Tenant {
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
// Upload the created data dir to S3
if self.tenant_shard_id().is_shard_zero() {
if self.tenant_shard_id().is_zero() {
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
.await?;
}
@@ -3406,11 +3398,7 @@ impl Tenant {
// is in progress (which is not a common case).
//
// See more on issue #2748, condensed out of the initial PR review.
let mut shared_cache = tokio::select! {
locked = self.cached_logical_sizes.lock() => locked,
_ = cancel.cancelled() => anyhow::bail!("cancelled"),
_ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
};
let mut shared_cache = self.cached_logical_sizes.lock().await;
size::gather_inputs(
self,
@@ -3449,7 +3437,7 @@ impl Tenant {
.store(size, Ordering::Relaxed);
// Only shard zero should be calculating synthetic sizes
debug_assert!(self.shard_identity.is_shard_zero());
debug_assert!(self.shard_identity.is_zero());
TENANT_SYNTHETIC_SIZE_METRIC
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
@@ -3672,7 +3660,6 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2),
}
}
}
@@ -3861,8 +3848,6 @@ pub(crate) mod harness {
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use super::*;
use crate::keyspace::KeySpaceAccum;
use crate::repository::{Key, Value};
@@ -3871,12 +3856,9 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use pageserver_api::key::NON_INHERITED_RANGE;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::CompactionAlgorithm;
use rand::{thread_rng, Rng};
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
use tests::timeline::ShutdownMode;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4513,23 +4495,11 @@ mod tests {
}
async fn bulk_insert_compact_gc(
timeline: Arc<Timeline>,
ctx: &RequestContext,
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<()> {
let compact = true;
bulk_insert_maybe_compact_gc(timeline, ctx, lsn, repeat, key_count, compact).await
}
async fn bulk_insert_maybe_compact_gc(
timeline: Arc<Timeline>,
ctx: &RequestContext,
mut lsn: Lsn,
repeat: usize,
key_count: usize,
compact: bool,
) -> anyhow::Result<()> {
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
@@ -4570,11 +4540,9 @@ mod tests {
)
.await?;
timeline.freeze_and_flush().await?;
if compact {
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
}
timeline
.compact(&CancellationToken::new(), EnumSet::empty(), ctx)
.await?;
timeline.gc().await?;
}
@@ -4678,9 +4646,7 @@ mod tests {
for read in reads {
info!("Doing vectored read on {:?}", read);
let vectored_res = tline
.get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
.await;
let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await;
tline
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
.await;
@@ -4689,67 +4655,6 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();
let mut modification = tline.begin_modification(Lsn(0x1000));
modification.put_file("foo/bar1", b"content1", &ctx).await?;
modification.set_lsn(Lsn(0x1008))?;
modification.put_file("foo/bar2", b"content2", &ctx).await?;
modification.commit(&ctx).await?;
let child_timeline_id = TimelineId::generate();
tenant
.branch_timeline_test(
tline,
child_timeline_id,
Some(tline.get_last_record_lsn()),
&ctx,
)
.await?;
let child_timeline = tenant
.get_timeline(child_timeline_id, true)
.expect("Should have the branched timeline");
let aux_keyspace = KeySpace {
ranges: vec![NON_INHERITED_RANGE],
};
let read_lsn = child_timeline.get_last_record_lsn();
let vectored_res = child_timeline
.get_vectored_impl(
aux_keyspace.clone(),
read_lsn,
ValuesReconstructState::new(),
&ctx,
)
.await;
child_timeline
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
.await;
let images = vectored_res?;
let mut key = NON_INHERITED_RANGE.start;
while key < NON_INHERITED_RANGE.end {
assert!(matches!(
images[&key],
Err(PageReconstructError::MissingKey(_))
));
key = key.next();
}
Ok(())
}
// Test that vectored get handles layer gaps correctly
// by advancing into the next ancestor timeline if required.
//
@@ -4878,12 +4783,7 @@ mod tests {
ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
};
let results = child_timeline
.get_vectored_impl(
read.clone(),
current_lsn,
ValuesReconstructState::new(),
&ctx,
)
.get_vectored_impl(read.clone(), current_lsn, &ctx)
.await?;
for (key, img_res) in results {
@@ -4894,185 +4794,9 @@ mod tests {
Ok(())
}
// Test that vectored get descends into ancestor timelines correctly and
// does not return an image that's newer than requested.
//
// The diagram below illustrates an interesting case. We have a parent timeline
// (top of the Lsn range) and a child timeline. The request key cannot be reconstructed
// from the child timeline, so the parent timeline must be visited. When advancing into
// the parent timeline, the read path needs to remember what the requested Lsn was in
// order to avoid returning an image that's too new. The test below constructs such
// a timeline setup and does a few queries around the Lsn of each page image.
// ```
// LSN
// ^
// |
// |
// 500 | --------------------------------------> branch point
// 400 | X
// 300 | X
// 200 | --------------------------------------> requested lsn
// 100 | X
// |---------------------------------------> Key
// |
// ------> requested key
//
// Legend:
// * X - page images
// ```
#[tokio::test]
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
let (tenant, ctx) = harness.load().await;
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_key = start_key.add(1000);
let child_gap_at_key = start_key.add(500);
let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
let mut current_lsn = Lsn(0x10);
let timeline_id = TimelineId::generate();
let parent_timeline = tenant
.create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
current_lsn += 0x100;
for _ in 0..3 {
let mut key = start_key;
while key < end_key {
current_lsn += 0x10;
let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
let mut writer = parent_timeline.writer().await;
writer
.put(
key,
current_lsn,
&Value::Image(test_img(&image_value)),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
if key == child_gap_at_key {
parent_gap_lsns.insert(current_lsn, image_value);
}
key = key.next();
}
parent_timeline.freeze_and_flush().await?;
}
let child_timeline_id = TimelineId::generate();
let child_timeline = tenant
.branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
.await?;
let mut key = start_key;
while key < end_key {
if key == child_gap_at_key {
key = key.next();
continue;
}
current_lsn += 0x10;
let mut writer = child_timeline.writer().await;
writer
.put(
key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
key = key.next();
}
child_timeline.freeze_and_flush().await?;
let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
let mut query_lsns = Vec::new();
for image_lsn in parent_gap_lsns.keys().rev() {
for offset in lsn_offsets {
query_lsns.push(Lsn(image_lsn
.0
.checked_add_signed(offset)
.expect("Shouldn't overflow")));
}
}
for query_lsn in query_lsns {
let results = child_timeline
.get_vectored_impl(
KeySpace {
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
},
query_lsn,
ValuesReconstructState::new(),
&ctx,
)
.await;
let expected_item = parent_gap_lsns
.iter()
.rev()
.find(|(lsn, _)| **lsn <= query_lsn);
info!(
"Doing vectored read at LSN {}. Expecting image to be: {:?}",
query_lsn, expected_item
);
match expected_item {
Some((_, img_value)) => {
let key_results = results.expect("No vectored get error expected");
let key_result = &key_results[&child_gap_at_key];
let returned_img = key_result
.as_ref()
.expect("No page reconstruct error expected");
info!(
"Vectored read at LSN {} returned image {}",
query_lsn,
std::str::from_utf8(returned_img)?
);
assert_eq!(*returned_img, test_img(img_value));
}
None => {
assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
}
}
}
Ok(())
}
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
let names_algorithms = [
("test_random_updates_legacy", CompactionAlgorithm::Legacy),
("test_random_updates_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_random_updates_algorithm(name, algorithm).await?;
}
Ok(())
}
async fn test_random_updates_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let harness = TenantHarness::create("test_random_updates")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5137,7 +4861,7 @@ mod tests {
);
}
// Perform a cycle of flush, and GC
// Perform a cycle of flush, compact, and GC
let cutoff = tline.get_last_record_lsn();
tline
.update_gc_info(
@@ -5149,6 +4873,9 @@ mod tests {
)
.await?;
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
tline.gc().await?;
}
@@ -5429,36 +5156,19 @@ mod tests {
#[tokio::test]
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
let names_algorithms = [
("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_read_at_max_lsn_algorithm(name, algorithm).await?;
}
Ok(())
}
async fn test_read_at_max_lsn_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name)?;
harness.tenant_conf.compaction_algorithm = compaction_algorithm;
let harness = TenantHarness::create("test_read_at_max_lsn")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let lsn = Lsn(0x10);
let compact = false;
bulk_insert_maybe_compact_gc(tline.clone(), &ctx, lsn, 50, 10000, compact).await?;
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let read_lsn = Lsn(u64::MAX - 1);
let result = tline.get(test_key, read_lsn, &ctx).await;
assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
Ok(())
}

View File

@@ -121,7 +121,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
self.offset
}
const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
/// Writes the given buffer directly to the underlying `VirtualFile`.
/// You need to make sure that the internal buffer is empty, otherwise

View File

@@ -369,10 +369,6 @@ pub struct TenantConf {
// How much WAL must be ingested before checking again whether a new image layer is required.
// Expressed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
/// Switch to aux file v2. Switching this flag requires that the user has not written any aux files into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruption.
pub switch_to_aux_file_v2: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -468,10 +464,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_to_aux_file_v2: Option<bool>,
}
impl TenantConfOpt {
@@ -529,9 +521,6 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
switch_to_aux_file_v2: self
.switch_to_aux_file_v2
.unwrap_or(global_conf.switch_to_aux_file_v2),
}
}
}
@@ -573,7 +562,6 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_to_aux_file_v2: false,
}
}
}
@@ -648,7 +636,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_to_aux_file_v2: value.switch_to_aux_file_v2,
}
}
}

View File

@@ -436,11 +436,6 @@ impl DeleteTenantFlow {
.await
}
/// Check whether background deletion of this tenant is currently in progress
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
tenant.delete_progress.try_lock().is_err()
}
async fn prepare(
tenant: &Arc<Tenant>,
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {

View File

@@ -3,26 +3,36 @@
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use bytes::BytesMut;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::io;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::TimelineId;
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
rw: page_caching::RW,
file: VirtualFile,
len: u64,
/// An ephemeral file is append-only.
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
/// The other pages, which can no longer be modified, are accessed through the page cache.
///
/// None <=> IO is ongoing.
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
mutable_tail: Option<BytesMut>,
}
mod page_caching;
mod zero_padded_read_write;
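The append-only, page-buffered pattern described above (accumulate bytes in one mutable tail page, flush it once it is full, then reuse it) can be sketched independently of the surrounding VirtualFile and page-cache machinery; the struct below is purely illustrative, with an assumed 8 KiB page size:

const PAGE_SZ_SKETCH: usize = 8192; // assumed page size for the sketch

struct PageBufferedAppender {
    flushed: Vec<[u8; PAGE_SZ_SKETCH]>, // stands in for the pages already written out
    tail: [u8; PAGE_SZ_SKETCH],         // the single mutable page at the end of the file
    tail_used: usize,
}

impl PageBufferedAppender {
    fn write(&mut self, mut src: &[u8]) {
        while !src.is_empty() {
            let n = (PAGE_SZ_SKETCH - self.tail_used).min(src.len());
            self.tail[self.tail_used..self.tail_used + n].copy_from_slice(&src[..n]);
            self.tail_used += n;
            src = &src[n..];
            if self.tail_used == PAGE_SZ_SKETCH {
                // Tail is full: write it out as one complete page and start a fresh one.
                self.flushed.push(self.tail);
                self.tail = [0u8; PAGE_SZ_SKETCH]; // zeroed for reuse, as the real code requires
                self.tail_used = 0;
            }
        }
    }
}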
impl EphemeralFile {
pub async fn create(
conf: &PageServerConf,
@@ -49,18 +59,21 @@ impl EphemeralFile {
.await?;
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file),
file,
len: 0,
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
})
}
pub(crate) fn len(&self) -> u64 {
self.rw.bytes_written()
self.len
}
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
self.rw.page_cache_file_id()
pub(crate) fn id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn read_blk(
@@ -68,30 +81,182 @@ impl EphemeralFile {
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
self.rw.read_blk(blknum, ctx).await
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum, self.file.path, e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = self
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
};
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(
self.mutable_tail
.as_deref()
.expect("we're not doing IO, it must be Some()")
.try_into()
.expect("we ensure that it's always PAGE_SZ"),
))
}
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
_ctx: &RequestContext,
ctx: &RequestContext,
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
blknum: u32,
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
off: usize,
}
impl<'a> Writer<'a> {
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
Ok(Writer {
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
ephemeral_file,
})
}
#[inline(always)]
async fn push_bytes(
&mut self,
src: &[u8],
ctx: &RequestContext,
) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
let dst_remaining = &mut self
.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not yet ongoing")[self.off..];
let n = min(dst_remaining.len(), src_remaining.len());
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
.expect("IO is not yet ongoing");
let (mutable_tail, res) = self
.ephemeral_file
.file
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
.await;
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
// I.e., the IO isn't retryable if we panic.
self.ephemeral_file.mutable_tail = Some(mutable_tail);
match res {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
let cache = page_cache::get();
match cache
.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
ctx,
)
.await
{
Ok(page_cache::ReadBufResult::Found(_guard)) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
}
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(
self.ephemeral_file
.mutable_tail
.as_deref()
.expect("IO is not ongoing"),
);
let _ = write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
}
// Zero the buffer for re-use.
// Zeroing is critical for correctness because the write_blob code below
// and similarly read_blk expect zeroed pages.
self.ephemeral_file
.mutable_tail
.as_deref_mut()
.expect("IO is not ongoing")
.fill(0);
// This block is done, move to next one.
self.blknum += 1;
self.off = 0;
}
Err(e) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
self.blknum,
e,
self.ephemeral_file.file.path,
),
));
}
}
}
}
Ok(())
}
}
let pos = self.len;
let mut writer = Writer::new(self)?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
self.rw.write_all_borrowed(&len_buf).await?;
writer.push_bytes(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf).await?;
writer.push_bytes(&len_buf, ctx).await?;
}
// Write the payload
self.rw.write_all_borrowed(srcbuf).await?;
writer.push_bytes(srcbuf, ctx).await?;
if srcbuf.len() < 0x80 {
self.len += 1;
} else {
self.len += 4;
}
self.len += srcbuf.len() as u64;
Ok(pos)
}
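The length header written by write_blob above uses a two-case encoding: payloads shorter than 0x80 bytes get a single-byte header, anything longer gets a 4-byte big-endian length with the top bit of the first byte set as a marker. A standalone sketch of just that encoding (the helper name is made up for illustration):

fn encode_len_header(len: usize) -> Vec<u8> {
    if len < 0x80 {
        // short one-byte length header
        vec![len as u8]
    } else {
        // 4-byte big-endian length with the high bit of the first byte set
        let mut len_buf = u32::to_be_bytes(len as u32);
        len_buf[0] |= 0x80;
        len_buf.to_vec()
    }
}

fn main() {
    assert_eq!(encode_len_header(5), vec![0x05]);
    assert_eq!(encode_len_header(300), vec![0x80, 0x00, 0x01, 0x2c]);
}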
@@ -106,6 +271,28 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
impl Drop for EphemeralFile {
fn drop(&mut self) {
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
let res = std::fs::remove_file(&self.file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.file.path, e
);
}
}
}
}
impl BlockReader for EphemeralFile {
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))

View File

@@ -1,218 +0,0 @@
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::BlockLease;
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use std::io::{self, ErrorKind};
use tokio_epoll_uring::BoundedBuf;
use tracing::*;
use super::zero_padded_read_write;
/// See module-level comment.
pub struct RW {
page_cache_file_id: page_cache::FileId,
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
}
impl RW {
pub fn new(file: VirtualFile) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
page_cache_file_id,
file,
)),
}
}
pub fn page_cache_file_id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result<usize, io::Error> {
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
// because Compute is unlikely to access recently written data.
self.rw.write_all_borrowed(srcbuf).await
}
pub(crate) fn bytes_written(&self) -> u64 {
self.rw.bytes_written()
}
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
match self.rw.read_blk(blknum).await? {
zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.rw.as_writer().file.path,
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = writer
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
}
}
zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
Ok(BlockLease::EphemeralFileMutableTail(buffer))
}
}
}
}
impl Drop for RW {
fn drop(&mut self) {
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// don't log NotFound errors, there is nothing we can do about them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.rw.as_writer().file.path,
e
);
}
}
}
}
struct PreWarmingWriter {
nwritten_blocks: u32,
page_cache_file_id: page_cache::FileId,
file: VirtualFile,
}
impl PreWarmingWriter {
fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
Self {
nwritten_blocks: 0,
page_cache_file_id,
file,
}
}
}
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
async fn write_all<
B: tokio_epoll_uring::BoundedBuf<Buf = Buf>,
Buf: tokio_epoll_uring::IoBuf + Send,
>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
let buf = buf.slice(..);
let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) {
Some(buf.to_vec())
} else {
None
};
let buflen = buf.len();
assert_eq!(
buflen % PAGE_SZ,
0,
"{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
);
// Do the IO.
let iobuf = match self.file.write_all(buf).await {
(iobuf, Ok(nwritten)) => {
assert_eq!(nwritten, buflen);
iobuf
}
(_, Err(e)) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
self.nwritten_blocks, buflen, e, self.file.path,
),
));
}
};
// Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf)
let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds);
if let Some(check_bounds_stuff_works) = check_bounds_stuff_works {
assert_eq!(&check_bounds_stuff_works, &*buf);
}
// Pre-warm page cache with the contents.
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
// benefits the code that writes InMemoryLayer=>L0 layers.
let nblocks = buflen / PAGE_SZ;
let nblocks32 = u32::try_from(nblocks).unwrap();
let cache = page_cache::get();
static CTX: Lazy<RequestContext> = Lazy::new(|| {
RequestContext::new(
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
crate::context::DownloadBehavior::Error,
)
});
for blknum_in_buffer in 0..nblocks {
let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
let blknum = self
.nwritten_blocks
.checked_add(blknum_in_buffer as u32)
.unwrap();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
.await
{
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
Ok(v) => match v {
page_cache::ReadBufResult::Found(_guard) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
and this function takes &mut self, so, no concurrent read_blk is possible");
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(blk_in_buffer);
let _ = write_guard.mark_valid();
}
},
}
}
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
Ok((buflen, buf.into_inner()))
}
}
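A toy, self-contained model of the write-through pre-warming performed above (assumptions: 8 KiB pages and a plain HashMap standing in for the real page cache): after a buffer of whole pages has been written to the file, each page is also copied into the read cache keyed by (file_id, blknum), so a subsequent read_blk can be served from memory.
use std::collections::HashMap;
const PAGE_SZ: usize = 8192;
// Stand-in for the real page cache, keyed by (file id, block number).
struct ToyPageCache {
    pages: HashMap<(u64, u32), Vec<u8>>,
}
fn prewarm(cache: &mut ToyPageCache, file_id: u64, first_blknum: u32, written: &[u8]) {
    assert_eq!(written.len() % PAGE_SZ, 0, "only whole pages are written back");
    for (i, page) in written.chunks_exact(PAGE_SZ).enumerate() {
        cache
            .pages
            .insert((file_id, first_blknum + i as u32), page.to_vec());
    }
}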

View File

@@ -1,125 +0,0 @@
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
//!
//! # Writes
//!
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
//!
//! # Reads
//!
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
//!
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
//! if the read is for the prefix that has already been flushed.
//!
//! # Current Usage
//!
//! The current user of this module is [`super::page_caching::RW`].
mod zero_padded;
use crate::{
page_cache::PAGE_SZ,
virtual_file::owned_buffers_io::{
self,
write::{Buffer, OwnedAsyncWriter},
},
};
const TAIL_SZ: usize = 64 * 1024;
/// See module-level comment.
pub struct RW<W: OwnedAsyncWriter> {
buffered_writer: owned_buffers_io::write::BufferedWriter<
zero_padded::Buffer<TAIL_SZ>,
owned_buffers_io::util::size_tracking_writer::Writer<W>,
>,
}
pub enum ReadResult<'a, W> {
NeedsReadFromWriter { writer: &'a W },
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
}
impl<W> RW<W>
where
W: OwnedAsyncWriter,
{
pub fn new(writer: W) -> Self {
let bytes_flushed_tracker =
owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
bytes_flushed_tracker,
zero_padded::Buffer::default(),
);
Self { buffered_writer }
}
pub(crate) fn as_writer(&self) -> &W {
self.buffered_writer.as_inner().as_inner()
}
pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.buffered_writer.write_buffered_borrowed(buf).await
}
pub fn bytes_written(&self) -> u64 {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
flushed_offset + u64::try_from(buffer.pending()).unwrap()
}
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
// The trailing page ("block") might only be partially filled,
// yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
// Moreover, it has to be zero-padded, because when we still had
// a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
// => check here that the read doesn't go beyond this potentially partial trailing page
// => the zero-padding is done in the `else` branch below
let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
buffered_offset / (PAGE_SZ as u64)
} else {
(buffered_offset / (PAGE_SZ as u64)) + 1
};
if (blknum as u64) >= blocks_written {
return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset:x}")));
}
// assertions for the `if-else` below
assert_eq!(
flushed_offset % (TAIL_SZ as u64), 0,
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
);
assert_eq!(
flushed_offset % (PAGE_SZ as u64),
0,
"the logic below can't handle if the page is spread across the flushed part and the buffer"
);
if read_offset < flushed_offset {
assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
Ok(ReadResult::NeedsReadFromWriter {
writer: self.as_writer(),
})
} else {
let read_offset_in_buffer = read_offset
.checked_sub(flushed_offset)
.expect("would have taken `if` branch instead of this one");
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
let zero_padded_slice = buffer.as_zero_padded_slice();
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
buffer: page
.try_into()
.expect("the slice above got it as page-size slice"),
})
}
}
}
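To summarize the routing decision in read_blk above, here is a small self-contained sketch, assuming the pageserver's 8 KiB PAGE_SZ and PAGE_SZ-aligned flushes (which the asserts above guarantee); it is illustrative only and not this module's actual API.
const PAGE_SZ: u64 = 8192;
enum Route {
    FromFlushedPrefix, // redirect to the underlying writer (and page cache)
    FromMutableTail,   // serve from the zero-padded in-memory tail
    PastEnd,           // read beyond everything written so far: an error
}
fn route_read(blknum: u64, flushed_offset: u64, buffered_offset: u64) -> Route {
    let read_offset = blknum * PAGE_SZ;
    // pages written so far, counting a partially filled trailing page as one
    let blocks_written = (buffered_offset + PAGE_SZ - 1) / PAGE_SZ;
    if blknum >= blocks_written {
        Route::PastEnd
    } else if read_offset < flushed_offset {
        Route::FromFlushedPrefix
    } else {
        Route::FromMutableTail
    }
}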

View File

@@ -1,108 +0,0 @@
//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
//! unwritten range is guaranteed to be zero-initialized.
//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
use std::mem::MaybeUninit;
/// See module-level comment.
pub struct Buffer<const N: usize> {
allocation: Box<[u8; N]>,
written: usize,
}
impl<const N: usize> Default for Buffer<N> {
fn default() -> Self {
Self {
allocation: Box::new(
// SAFETY: zeroed memory is a valid [u8; N]
unsafe { MaybeUninit::zeroed().assume_init() },
),
written: 0,
}
}
}
impl<const N: usize> Buffer<N> {
#[inline(always)]
fn invariants(&self) {
// don't check by default, unoptimized is too expensive even for debug mode
if false {
debug_assert!(self.written <= N, "{}", self.written);
debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
}
}
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
&self.allocation
}
}
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
type IoBuf = Self;
fn cap(&self) -> usize {
self.allocation.len()
}
fn extend_from_slice(&mut self, other: &[u8]) {
self.invariants();
let remaining = self.allocation.len() - self.written;
if other.len() > remaining {
panic!("calling extend_from_slice() with insufficient remaining capacity");
}
self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
self.written += other.len();
self.invariants();
}
fn pending(&self) -> usize {
self.written
}
fn flush(self) -> tokio_epoll_uring::Slice<Self> {
self.invariants();
let written = self.written;
tokio_epoll_uring::BoundedBuf::slice(self, 0..written)
}
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
let Self {
mut allocation,
written,
} = iobuf;
allocation[0..written].fill(0);
let new = Self {
allocation,
written: 0,
};
new.invariants();
new
}
}
/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
///
/// Remember that bytes_init is generally _not_ a tracker of the amount
/// of valid data in the io buffer; we use `Slice` for that.
/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
///
/// SAFETY:
///
/// The [`Self::allocation`] is stable because boxes are stable.
/// The memory is zero-initialized, so, bytes_init is always N.
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
fn stable_ptr(&self) -> *const u8 {
self.allocation.as_ptr()
}
fn bytes_init(&self) -> usize {
// Yes, N, not self.written; Read the full comment of this impl block!
N
}
fn bytes_total(&self) -> usize {
N
}
}
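A minimal illustration (not part of this diff; constants assumed to match the ones above) of why the zero-padded invariant matters: even when the trailing page is only partially filled, a full PAGE_SZ slice can be handed out because everything past `written` is guaranteed to be zero.
const PAGE_SZ: usize = 8192;
const TAIL_SZ: usize = 64 * 1024;
fn last_page(tail: &[u8; TAIL_SZ], written: usize) -> &[u8] {
    assert!(written < TAIL_SZ);
    let page_start = (written / PAGE_SZ) * PAGE_SZ;
    // The bytes in tail[written..] are all zero, so this slice is valid zero-padded
    // page content even though only `written - page_start` bytes were ever written.
    &tail[page_start..page_start + PAGE_SZ]
}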

View File

@@ -916,7 +916,6 @@ mod tests {
assert_eq!(lhs, rhs);
}
#[cfg(test)]
fn brute_force_range_search(
layer_map: &LayerMap,
key_range: Range<Key>,

View File

@@ -235,12 +235,6 @@ impl TimelineMetadata {
let bytes = instance.to_bytes().unwrap();
Self::from_bytes(&bytes).unwrap()
}
pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
self.body.disk_consistent_lsn = update.disk_consistent_lsn;
self.body.prev_record_lsn = update.prev_record_lsn;
self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
}
}
impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -265,27 +259,6 @@ impl Serialize for TimelineMetadata {
}
}
/// Parts of the metadata which are regularly modified.
pub(crate) struct MetadataUpdate {
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
latest_gc_cutoff_lsn: Lsn,
}
impl MetadataUpdate {
pub(crate) fn new(
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
latest_gc_cutoff_lsn: Lsn,
) -> Self {
Self {
disk_consistent_lsn,
prev_record_lsn,
latest_gc_cutoff_lsn,
}
}
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -2,7 +2,6 @@
//! page server.
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
@@ -254,15 +253,17 @@ impl TenantsMap {
}
}
/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
/// the slower actual deletion in the background.
///
/// This is "safe" in that that it won't leave behind a partially deleted directory
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
/// the contents.
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
let tmp_path = safe_rename_tenant_dir(path).await?;
fs::remove_dir_all(tmp_path).await
}
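A minimal sketch of the rename-then-delete pattern described above, using std::fs for brevity (the real code uses async tokio::fs and TEMP_FILE_SUFFIX); the suffix handling here is simplified and illustrative.
use std::path::{Path, PathBuf};
fn sketch_safe_remove_dir_all(path: &Path, tmp_suffix: &str) -> std::io::Result<()> {
    let mut tmp = path.as_os_str().to_owned();
    tmp.push(tmp_suffix); // e.g. ".___deleted.temp"
    let tmp = PathBuf::from(tmp);
    // Fast and atomic on the same filesystem: after this, the original path is gone.
    std::fs::rename(path, &tmp)?;
    // The slow part. If we crash here, a later startup only needs to look for the
    // temp suffix to know the directory is garbage.
    std::fs::remove_dir_all(&tmp)
}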
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
let parent = path
.as_ref()
@@ -285,28 +286,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
Ok(tmp_path)
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
@@ -591,11 +570,7 @@ pub async fn init_tenant_mgr(
);
TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
// Accumulate futures for writing tenant configs, so that we can execute in parallel
let mut config_write_futs = Vec::new();
// Update the location configs according to the re-attach response and persist them to disk
tracing::info!("Updating {} location configs", tenant_configs.len());
// Construct `Tenant` objects and start them running
for (tenant_shard_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
@@ -622,22 +597,18 @@ pub async fn init_tenant_mgr(
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
SecondaryLocationConfig { warm: true };
// Update the location config according to the re-attach response
if let Some(tenant_modes) = &tenant_modes {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
match tenant_modes.get(&tenant_shard_id) {
None => {
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
match safe_rename_tenant_dir(&tenant_dir_path).await {
Ok(tmp_path) => {
spawn_background_purge(tmp_path);
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
}
};
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
// We deleted local content: move on to next tenant, don't try and spawn this one.
continue;
@@ -683,32 +654,8 @@ pub async fn init_tenant_mgr(
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
config_write_futs.push(async move {
let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
(tenant_shard_id, location_conf, r)
});
}
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
// Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
tracing::info!(
"Writing {} location config files...",
config_write_futs.len()
);
let config_write_results = futures::stream::iter(config_write_futs)
.buffer_unordered(16)
.collect::<Vec<_>>()
.await;
tracing::info!(
"Spawning {} tenant shard locations...",
config_write_results.len()
);
// For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
// Errors writing configs are fatal
config_write_result?;
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
let shard_identity = location_conf.shard;
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => {
@@ -731,19 +678,12 @@ pub async fn init_tenant_mgr(
}
}
}
LocationMode::Secondary(secondary_conf) => {
info!(
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
"Starting secondary tenant"
);
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
location_conf.tenant_conf,
&secondary_conf,
))
}
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
location_conf.tenant_conf,
&secondary_conf,
)),
};
tenants.insert(tenant_shard_id, slot);
@@ -1470,15 +1410,9 @@ impl TenantManager {
match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If deletion is already in progress, return success (the semantics of this
// function are to return success after deletion is spawned in the background).
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
if DeleteTenantFlow::is_in_progress(&tenant) {
// The `delete_progress` lock is held: deletion is already happening
// in the background
slot_guard.revert();
return Ok(());
}
// If a tenant is broken or stopping, DeleteTenantFlow can
// handle it: broken tenants proceed to delete, stopping tenants
// are checked for deletion already in progress.
}
_ => {
tenant
@@ -1752,7 +1686,7 @@ impl TenantManager {
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
spawn_background_purge(tmp_path);
self.spawn_background_purge(tmp_path);
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
"failpoint"
@@ -1907,6 +1841,28 @@ impl TenantManager {
shutdown_all_tenants0(self.tenants).await
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
pub(crate) async fn detach_tenant(
&self,
conf: &'static PageServerConf,
@@ -1923,7 +1879,7 @@ impl TenantManager {
deletion_queue_client,
)
.await?;
spawn_background_purge(tmp_path);
self.spawn_background_purge(tmp_path);
Ok(())
}

View File

@@ -202,9 +202,7 @@ use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use std::ops::DerefMut;
use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
@@ -238,14 +236,11 @@ use utils::id::{TenantId, TimelineId};
use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::Generation;
pub(crate) use download::{
download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
};
pub(crate) use download::{is_temp_download_file, list_remote_timelines};
pub(crate) use index::LayerFileMetadata;
// Occasional network issues and such can cause remote operations to fail, and
@@ -474,7 +469,7 @@ impl RemoteTimelineClient {
},
);
let (index_part, _index_generation) = download::download_index_part(
let index_part = download::download_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.timeline_id,
@@ -541,10 +536,9 @@ impl RemoteTimelineClient {
// Upload operations.
//
/// Launch an index-file upload operation in the background, with
/// fully updated metadata.
///
/// This should only be used to upload initial metadata to remote storage.
/// Launch an index-file upload operation in the background, with
/// updated metadata.
///
/// The upload will be added to the queue immediately, but it
/// won't be performed until all previously scheduled layer file
@@ -556,7 +550,7 @@ impl RemoteTimelineClient {
/// If there were any changes to the list of files, i.e. if any
/// layer file uploads were scheduled, since the last index file
/// upload, those will be included too.
pub fn schedule_index_upload_for_full_metadata_update(
pub fn schedule_index_upload_for_metadata_update(
self: &Arc<Self>,
metadata: &TimelineMetadata,
) -> anyhow::Result<()> {
@@ -572,27 +566,6 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, with only parts of the metadata
/// updated.
///
/// This is the regular way of updating metadata on layer flushes or Gc.
///
/// Using this lighter update mechanism allows for reparenting and detaching without changes to
/// `index_part.json`, while being more clear on what values update regularly.
pub(crate) fn schedule_index_upload_for_metadata_update(
self: &Arc<Self>,
update: &MetadataUpdate,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.latest_metadata.apply(update);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
@@ -1149,7 +1122,7 @@ impl RemoteTimelineClient {
// and the retry arrives at a different pageserver, there won't be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a listing below, we aren't
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.flush_deletion_queue().await?;
@@ -1158,20 +1131,14 @@ impl RemoteTimelineClient {
let remaining = download_retry(
|| async {
self.storage_impl
.list(
Some(&timeline_storage_path),
ListingMode::NoDelimiter,
None,
&cancel,
)
.list_files(Some(&timeline_storage_path), None, &cancel)
.await
},
"list remaining files",
&cancel,
)
.await
.context("list remaining files")?
.keys;
.context("list remaining files")?;
// We will delete the current index_part object last, since it acts as a deletion
// marker via its deleted_at attribute
@@ -1718,11 +1685,6 @@ impl RemoteTimelineClient {
}
}
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}");
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
RemotePath::from_string(&path).expect("Failed to construct path")
@@ -2062,7 +2024,7 @@ mod tests {
// Schedule upload of index. Check that it is queued
let metadata = dummy_metadata(Lsn(0x20));
client
.schedule_index_upload_for_full_metadata_update(&metadata)
.schedule_index_upload_for_metadata_update(&metadata)
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();

View File

@@ -5,7 +5,6 @@
use std::collections::HashSet;
use std::future::Future;
use std::str::FromStr;
use anyhow::{anyhow, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -26,13 +25,13 @@ use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use utils::id::TimelineId;
use super::index::{IndexPart, LayerFileMetadata};
use super::{
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
INITDB_PATH,
};
///
@@ -183,7 +182,6 @@ async fn download_object<'a>(
#[cfg(target_os = "linux")]
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
use bytes::BytesMut;
async {
let destination_file = VirtualFile::create(dst_path)
.await
@@ -196,10 +194,10 @@ async fn download_object<'a>(
// There's chunks_vectored() on the stream.
let (bytes_amount, destination_file) = async {
let size_tracking = size_tracking_writer::Writer::new(destination_file);
let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
size_tracking,
BytesMut::with_capacity(super::BUFFER_SIZE),
);
let mut buffered = owned_buffers_io::write::BufferedWriter::<
{ super::BUFFER_SIZE },
_,
>::new(size_tracking);
while let Some(res) =
futures::StreamExt::next(&mut download.download_stream).await
{
@@ -254,31 +252,42 @@ pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
}
}
async fn list_identifiers<T>(
/// List timelines of given tenant in remote storage
pub async fn list_remote_timelines(
storage: &GenericRemoteStorage,
prefix: RemotePath,
tenant_shard_id: TenantShardId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<T>, HashSet<String>)>
where
T: FromStr + Eq + std::hash::Hash,
{
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
let remote_path = remote_timelines_path(&tenant_shard_id);
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
});
let listing = download_retry_forever(
|| storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel),
&format!("list identifiers in prefix {prefix}"),
|| {
storage.list(
Some(&remote_path),
ListingMode::WithDelimiter,
None,
&cancel,
)
},
&format!("list timelines for {tenant_shard_id}"),
&cancel,
)
.await?;
let mut parsed_ids = HashSet::new();
let mut timeline_ids = HashSet::new();
let mut other_prefixes = HashSet::new();
for id_remote_storage_key in listing.prefixes {
let object_name = id_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}")
for timeline_remote_storage_key in listing.prefixes {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
})?;
match object_name.parse::<T>() {
Ok(t) => parsed_ids.insert(t),
match object_name.parse::<TimelineId>() {
Ok(t) => timeline_ids.insert(t),
Err(_) => other_prefixes.insert(object_name.to_string()),
};
}
@@ -290,31 +299,7 @@ where
other_prefixes.insert(object_name.to_string());
}
Ok((parsed_ids, other_prefixes))
}
/// List shards of given tenant in remote storage
pub(crate) async fn list_remote_tenant_shards(
storage: &GenericRemoteStorage,
tenant_id: TenantId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<TenantShardId>, HashSet<String>)> {
let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id));
list_identifiers::<TenantShardId>(storage, remote_path, cancel).await
}
/// List timelines of given tenant shard in remote storage
pub async fn list_remote_timelines(
storage: &GenericRemoteStorage,
tenant_shard_id: TenantShardId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
});
let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
list_identifiers::<TimelineId>(storage, remote_path, cancel).await
Ok((timeline_ids, other_prefixes))
}
async fn do_download_index_part(
@@ -323,7 +308,7 @@ async fn do_download_index_part(
timeline_id: &TimelineId,
index_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation), DownloadError> {
) -> Result<IndexPart, DownloadError> {
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let index_part_bytes = download_retry_forever(
@@ -348,7 +333,7 @@ async fn do_download_index_part(
.with_context(|| format!("deserialize index part file at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok((index_part, index_generation))
Ok(index_part)
}
/// index_part.json objects are suffixed with a generation number, so we cannot
@@ -357,13 +342,13 @@ async fn do_download_index_part(
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
pub(crate) async fn download_index_part(
pub(super) async fn download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation), DownloadError> {
) -> Result<IndexPart, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
if my_generation.is_none() {
@@ -432,16 +417,11 @@ pub(crate) async fn download_index_part(
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
let indices = download_retry(
|| async {
storage
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
.await
},
|| async { storage.list_files(Some(&index_prefix), None, cancel).await },
"list index_part files",
cancel,
)
.await?
.keys;
.await?;
// General case logic for which index to use: the latest index whose generation
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md

View File

@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
let warn_after = 3;
let max_attempts = 10;
let mut prefixes = Vec::with_capacity(2);
if tenant_shard_id.is_shard_zero() {
if tenant_shard_id.is_zero() {
// Also recover the unsharded prefix for a shard of zero:
// - if the tenant is totally unsharded, the unsharded prefix contains all the data
// - if the tenant is sharded, we still want to recover the initdb data, but we only

View File

@@ -312,7 +312,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
(detail.last_download, detail.next_download.unwrap())
};
if now > next_download {
if now < next_download {
Some(PendingDownload {
secondary_state: secondary_tenant,
last_download,
@@ -647,12 +647,6 @@ impl<'a> TenantDownloader<'a> {
progress.bytes_downloaded += layer_byte_count;
progress.layers_downloaded += layer_count;
}
for delete_timeline in &delete_timelines {
// We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
// from disk fails that will be a fatal error.
detail.timelines.remove(delete_timeline);
}
}
// Execute accumulated deletions
@@ -716,14 +710,13 @@ impl<'a> TenantDownloader<'a> {
.await
.map_err(UpdateError::from)?;
SECONDARY_MODE.download_heatmap.inc();
if Some(&download.etag) == prev_etag {
Ok(HeatMapDownload::Unmodified)
} else {
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
SECONDARY_MODE.download_heatmap.inc();
Ok(HeatMapDownload::Modified(HeatMapModified {
etag: download.etag,
last_modified: download.last_modified,

View File

@@ -118,9 +118,6 @@ pub(super) async fn gather_inputs(
ctx: &RequestContext,
) -> anyhow::Result<ModelInputs> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
//
// FIXME: if a single timeline is deleted while refresh gc info is ongoing, we will fail the
// whole computation. It does not make sense from the billing perspective.
tenant
.refresh_gc_info(cancel, ctx)
.await

View File

@@ -118,7 +118,6 @@ pub(crate) struct ValuesReconstructState {
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
keys_done: KeySpaceRandomAccum,
layers_visited: u32,
}
impl ValuesReconstructState {
@@ -126,7 +125,6 @@ impl ValuesReconstructState {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),
layers_visited: 0,
}
}
@@ -140,37 +138,6 @@ impl ValuesReconstructState {
}
}
pub(crate) fn on_layer_visited(&mut self) {
self.layers_visited += 1;
}
pub(crate) fn get_layers_visited(&self) -> u32 {
self.layers_visited
}
/// This function is called after reading a keyspace from a layer.
/// It checks if the read path has now moved past the cached Lsn for any keys.
///
/// Implementation note: We intentionally iterate over the keys for which we've
/// already collected some reconstruct data. This avoids scaling complexity with
/// the size of the search space.
pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
for (key, value) in self.keys.iter_mut() {
if !keyspace.contains(key) {
continue;
}
if let Ok(state) = value {
if state.situation != ValueReconstructSituation::Complete
&& state.get_cached_lsn() >= Some(advanced_to)
{
state.situation = ValueReconstructSituation::Complete;
self.keys_done.add_key(*key);
}
}
}
}
/// Update the state collected for a given key.
/// Returns true if this was the last value needed for the key and false otherwise.
///
@@ -195,18 +162,11 @@ impl ValuesReconstructState {
true
}
Value::WalRecord(rec) => {
debug_assert!(
Some(lsn) > state.get_cached_lsn(),
"Attempt to collect a record below cached LSN for walredo: {} < {}",
lsn,
state
.get_cached_lsn()
.expect("Assertion can only fire if a cached lsn is present")
);
let reached_cache =
state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
let will_init = rec.will_init();
state.records.push((lsn, rec));
will_init
will_init || reached_cache
}
},
};

View File

@@ -20,8 +20,8 @@
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
//! ```
//!
//! Every delta file consists of three parts: "summary", "values", and
//! "index". The summary is a fixed size header at the beginning of the file,
//! Every delta file consists of three parts: "summary", "index", and
//! "values". The summary is a fixed size header at the beginning of the file,
//! and it contains basic information about the layer, and offsets to the other
//! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the
//! "values" part. The actual page images and WAL records are stored in the
@@ -217,7 +217,6 @@ pub struct DeltaLayerInner {
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
lsn_range: Range<Lsn>,
file: VirtualFile,
file_id: FileId,
@@ -729,9 +728,6 @@ impl DeltaLayerInner {
// production code path
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
// mask out the timeline_id, but still require the layers to be from the same tenant
expected_summary.timeline_id = actual_summary.timeline_id;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
@@ -746,7 +742,6 @@ impl DeltaLayerInner {
file_id,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn_range: actual_summary.lsn_range,
max_vectored_read_bytes,
}))
}
@@ -868,10 +863,10 @@ impl DeltaLayerInner {
.into(),
);
let data_end_offset = self.index_start_offset();
let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
let reads = Self::plan_reads(
&keyspace,
keyspace,
lsn_range,
data_end_offset,
index_reader,
@@ -885,13 +880,11 @@ impl DeltaLayerInner {
self.do_reads_and_update_state(reads, reconstruct_state)
.await;
reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
Ok(())
}
async fn plan_reads<Reader>(
keyspace: &KeySpace,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
data_end_offset: u64,
index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
@@ -946,7 +939,7 @@ impl DeltaLayerInner {
}
if !range_end_handled {
tracing::debug!("Handling range end fallback at {}", data_end_offset);
tracing::info!("Handling range end fallback at {}", data_end_offset);
planner.handle_range_end(data_end_offset);
}
}
@@ -1110,195 +1103,11 @@ impl DeltaLayerInner {
if let Some(last) = all_keys.last_mut() {
// Last key occupies all space till end of value storage,
// which corresponds to beginning of the index
last.size = self.index_start_offset() - last.size;
last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
}
Ok(all_keys)
}
/// Using the given writer, write out a truncated version, where LSNs higher than the
/// truncate_at are missing.
#[cfg(test)]
pub(super) async fn copy_prefix(
&self,
writer: &mut DeltaLayerWriter,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use crate::tenant::vectored_blob_io::{
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
};
use futures::stream::TryStreamExt;
#[derive(Debug)]
enum Item {
Actual(Key, Lsn, BlobRef),
Sentinel,
}
impl From<Item> for Option<(Key, Lsn, BlobRef)> {
fn from(value: Item) -> Self {
match value {
Item::Actual(key, lsn, blob) => Some((key, lsn, blob)),
Item::Sentinel => None,
}
}
}
impl Item {
fn offset(&self) -> Option<BlobRef> {
match self {
Item::Actual(_, _, blob) => Some(*blob),
Item::Sentinel => None,
}
}
fn is_last(&self) -> bool {
matches!(self, Item::Sentinel)
}
}
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
block_reader,
);
let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
// put in a sentinel value for getting the end offset for last item, and not having to
// repeat the whole read part
let stream = stream.chain(futures::stream::once(futures::future::ready(Ok(
Item::Sentinel,
))));
let mut stream = std::pin::pin!(stream);
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
let mut read_builder: Option<VectoredReadBuilder> = None;
let max_read_size = self
.max_vectored_read_bytes
.map(|x| x.0.get())
.unwrap_or(8192);
let mut buffer = Some(BytesMut::with_capacity(max_read_size));
// FIXME: buffering of DeltaLayerWriter
let mut per_blob_copy = Vec::new();
while let Some(item) = stream.try_next().await? {
tracing::debug!(?item, "popped");
let offset = item
.offset()
.unwrap_or(BlobRef::new(self.index_start_offset(), false));
let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
let end_offset = offset;
Some((BlobMeta { key, lsn }, start_offset..end_offset))
} else {
None
};
let is_last = item.is_last();
prev = Option::from(item);
let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
let builder = if let Some((meta, offsets)) = actionable {
// extend or create a new builder
if read_builder
.as_mut()
.map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta))
.unwrap_or(VectoredReadExtended::No)
== VectoredReadExtended::Yes
{
None
} else {
read_builder.replace(VectoredReadBuilder::new(
offsets.start.pos(),
offsets.end.pos(),
meta,
max_read_size,
))
}
} else {
// nothing to do, except perhaps flush any existing builder for the last element
None
};
// flush the possible older builder and also the new one if the item was the last one
let builders = builder.into_iter();
let builders = if is_last {
builders.chain(read_builder.take())
} else {
builders.chain(None)
};
for builder in builders {
let read = builder.build();
let reader = VectoredBlobReader::new(&self.file);
let mut buf = buffer.take().unwrap();
buf.clear();
buf.reserve(read.size());
let res = reader.read_blobs(&read, buf).await?;
for blob in res.blobs {
let key = blob.meta.key;
let lsn = blob.meta.lsn;
let data = &res.buf[blob.start..blob.end];
#[cfg(debug_assertions)]
Value::des(data)
.with_context(|| {
format!(
"blob failed to deserialize for {}@{}, {}..{}: {:?}",
blob.meta.key,
blob.meta.lsn,
blob.start,
blob.end,
utils::Hex(data)
)
})
.unwrap();
// is it an image or will_init walrecord?
// FIXME: this could be handled by threading the BlobRef to the
// VectoredReadBuilder
let will_init = crate::repository::ValueBytes::will_init(data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
})
.unwrap_or(false);
per_blob_copy.clear();
per_blob_copy.extend_from_slice(data);
let (tmp, res) = writer
.put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
.await;
per_blob_copy = tmp;
res?;
}
buffer = Some(res.buf);
}
}
assert!(
read_builder.is_none(),
"with the sentinel above loop should had handled all"
);
Ok(())
}
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
println!(
"index_start_blk: {}, root {}",
@@ -1368,44 +1177,6 @@ impl DeltaLayerInner {
Ok(())
}
#[cfg(test)]
fn stream_index_forwards<'a, R>(
&'a self,
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
start: &'a [u8; DELTA_KEY_SIZE],
ctx: &'a RequestContext,
) -> impl futures::stream::Stream<
Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
> + 'a
where
R: BlockReader,
{
use futures::stream::TryStreamExt;
let stream = reader.get_stream_from(start, ctx);
stream.map_ok(|(key, value)| {
let key = DeltaKey::from_slice(&key);
let (key, lsn) = (key.key(), key.lsn());
let offset = BlobRef(value);
(key, lsn, offset)
})
}
/// The file offset to the first block of index.
///
/// The file structure is summary, values, and index. We often need this for the size of last blob.
fn index_start_offset(&self) -> u64 {
let offset = self.index_start_blk as u64 * PAGE_SZ as u64;
let bref = BlobRef(offset);
tracing::debug!(
index_start_blk = self.index_start_blk,
offset,
pos = bref.pos(),
"index_start_offset"
);
offset
}
}
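A worked example of the offset arithmetic above, assuming the pageserver's 8 KiB pages: an index starting at block 3 begins at byte 3 * 8192 = 24576, and the size of the last value is that offset minus the last value's own start offset, which is exactly the fix-up applied to `last.size` earlier in this file. Illustrative sketch only:
const PAGE_SZ: u64 = 8192;
fn index_start_offset(index_start_blk: u32) -> u64 {
    index_start_blk as u64 * PAGE_SZ
}
fn last_value_size(last_value_start: u64, index_start_blk: u32) -> u64 {
    // the last value occupies everything up to the beginning of the index
    index_start_offset(index_start_blk) - last_value_start
}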
/// A set of data associated with a delta layer key and its value
@@ -1539,7 +1310,7 @@ mod test {
// Plan and validate
let vectored_reads = DeltaLayerInner::plan_reads(
&keyspace,
keyspace.clone(),
lsn_range.clone(),
disk_offset,
reader,
@@ -1767,7 +1538,7 @@ mod test {
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
let inner = resident.as_delta(&ctx).await?;
let inner = resident.get_inner_delta(&ctx).await?;
let file_size = inner.file.metadata().await?.len();
tracing::info!(
@@ -1791,7 +1562,7 @@ mod test {
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
let vectored_reads = DeltaLayerInner::plan_reads(
&keyspace,
keyspace.clone(),
entries_meta.lsn_range.clone(),
data_end_offset,
index_reader,
@@ -1823,217 +1594,4 @@ mod test {
Ok(())
}
#[tokio::test]
async fn copy_delta_prefix_smoke() {
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
let (tenant, ctx) = h.load().await;
let ctx = &ctx;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
.await
.unwrap();
let initdb_layer = timeline
.layers
.read()
.await
.likely_resident_layers()
.next()
.unwrap();
{
let mut writer = timeline.writer().await;
let data = [
(0x20, 12, Value::Image(Bytes::from_static(b"foobar"))),
(
0x30,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: false,
rec: Bytes::from_static(b"1"),
}),
),
(
0x40,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: true,
rec: Bytes::from_static(b"2"),
}),
),
// build an oversized value so we cannot extend an existing read over
// this
(
0x50,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: true,
rec: {
let mut buf =
vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024];
buf.iter_mut()
.enumerate()
.for_each(|(i, slot)| *slot = (i % 256) as u8);
Bytes::from(buf)
},
}),
),
// because the oversized read cannot be extended further, we are sure to exercise the
// builder created on the last round with this:
(
0x60,
12,
Value::WalRecord(NeonWalRecord::Postgres {
will_init: true,
rec: Bytes::from_static(b"3"),
}),
),
(
0x60,
9,
Value::Image(Bytes::from_static(b"something for a different key")),
),
];
let mut last_lsn = None;
for (lsn, key, value) in data {
let key = Key::from_i128(key);
writer.put(key, Lsn(lsn), &value, ctx).await.unwrap();
last_lsn = Some(lsn);
}
writer.finish_write(Lsn(last_lsn.unwrap()));
}
timeline.freeze_and_flush().await.unwrap();
let new_layer = timeline
.layers
.read()
.await
.likely_resident_layers()
.find(|x| x != &initdb_layer)
.unwrap();
// create a copy for the timeline, so we don't overwrite the file
let branch = tenant
.branch_timeline_test(&timeline, TimelineId::generate(), None, ctx)
.await
.unwrap();
assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60));
// truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just
// a single key
for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] {
let truncate_at = Lsn(truncate_at);
let mut writer = DeltaLayerWriter::new(
tenant.conf,
branch.timeline_id,
tenant.tenant_shard_id,
Key::MIN,
Lsn(0x11)..truncate_at,
)
.await
.unwrap();
let new_layer = new_layer.download_and_keep_resident().await.unwrap();
new_layer
.copy_delta_prefix(&mut writer, truncate_at, ctx)
.await
.unwrap();
let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
copied_layer.as_delta(ctx).await.unwrap();
assert_keys_and_values_eq(
new_layer.as_delta(ctx).await.unwrap(),
copied_layer.as_delta(ctx).await.unwrap(),
truncate_at,
ctx,
)
.await;
}
}
async fn assert_keys_and_values_eq(
source: &DeltaLayerInner,
truncated: &DeltaLayerInner,
truncated_at: Lsn,
ctx: &RequestContext,
) {
use futures::future::ready;
use futures::stream::TryStreamExt;
let start_key = [0u8; DELTA_KEY_SIZE];
let source_reader = FileBlockReader::new(&source.file, source.file_id);
let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
source.index_start_blk,
source.index_root_blk,
&source_reader,
);
let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
let source_stream = source_stream.filter(|res| match res {
Ok((_, lsn, _)) => ready(lsn < &truncated_at),
_ => ready(true),
});
let mut source_stream = std::pin::pin!(source_stream);
let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id);
let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
truncated.index_start_blk,
truncated.index_root_blk,
&truncated_reader,
);
let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
let mut truncated_stream = std::pin::pin!(truncated_stream);
let mut scratch_left = Vec::new();
let mut scratch_right = Vec::new();
loop {
let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next());
let (src, truncated) = tokio::try_join!(src, truncated).unwrap();
if src.is_none() {
assert!(truncated.is_none());
break;
}
let (src, truncated) = (src.unwrap(), truncated.unwrap());
// because we've filtered the source with Lsn, we should always have the same keys from both.
assert_eq!(src.0, truncated.0);
assert_eq!(src.1, truncated.1);
// if this is needed for something else, just drop this assert.
assert!(
src.2.pos() >= truncated.2.pos(),
"value position should not go backwards {} vs. {}",
src.2.pos(),
truncated.2.pos()
);
scratch_left.clear();
let src_cursor = source_reader.block_cursor();
let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx);
scratch_right.clear();
let trunc_cursor = truncated_reader.block_cursor();
let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx);
tokio::try_join!(left, right).unwrap();
assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
}
}
}

View File

@@ -396,8 +396,6 @@ impl ImageLayerInner {
// production code path
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
// mask out the timeline_id, but still require the layers to be from the same tenant
expected_summary.timeline_id = actual_summary.timeline_id;
if actual_summary != expected_summary {
bail!(

View File

@@ -17,7 +17,7 @@ use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, BinaryHeap, HashSet};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
@@ -26,7 +26,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// while being able to use std::fmt::Write's methods
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
use std::cmp::Ordering;
use std::fmt::Write;
use std::fmt::Write as _;
use std::ops::Range;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::atomic::{AtomicU64, AtomicUsize};
@@ -54,12 +54,6 @@ pub struct InMemoryLayer {
/// Writes are only allowed when this is `None`.
end_lsn: OnceLock<Lsn>,
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
local_path_str: Arc<str>,
/// Used for traversal path. Cached representation of the in-memory layer after frozen.
frozen_local_path_str: OnceLock<Arc<str>>,
opened_at: Instant,
/// The above fields never change, except for `end_lsn`, which is only set once.
@@ -78,10 +72,10 @@ impl std::fmt::Debug for InMemoryLayer {
}
pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<Key, VecMap<Lsn, u64>>,
index: HashMap<Key, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
@@ -247,12 +241,6 @@ impl InMemoryLayer {
self.start_lsn..self.end_lsn_or_max()
}
pub(crate) fn local_path_str(&self) -> &Arc<str> {
self.frozen_local_path_str
.get()
.unwrap_or(&self.local_path_str)
}
/// debugging function to print out the contents of the layer
///
/// this is likely completely unused
@@ -384,24 +372,29 @@ impl InMemoryLayer {
let mut planned_block_reads = BinaryHeap::new();
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner.index.range(range.start..range.end) {
let lsn_range = match reconstruct_state.get_cached_lsn(key) {
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
None => self.start_lsn..end_lsn,
};
let mut key = range.start;
while key < range.end {
if let Some(vec_map) = inner.index.get(&key) {
let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
None => self.start_lsn..end_lsn,
};
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
planned_block_reads.push(BlockRead {
key: *key,
lsn: *entry_lsn,
block_offset: *pos,
});
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
planned_block_reads.push(BlockRead {
key,
lsn: *entry_lsn,
block_offset: *pos,
});
}
}
key = key.next();
}
}
let keyspace_size = keyspace.total_raw_size();
let keyspace_size = keyspace.total_size();
let mut completed_keys = HashSet::new();
while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
@@ -433,30 +426,14 @@ impl InMemoryLayer {
}
}
reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
Ok(())
}
}
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}
fn inmem_layer_log_display(
mut f: impl Write,
timeline: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
) -> std::fmt::Result {
write!(f, "timeline {} in-memory ", timeline)?;
inmem_layer_display(f, start_lsn, end_lsn)
}
impl std::fmt::Display for InMemoryLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let end_lsn = self.end_lsn_or_max();
inmem_layer_display(f, self.start_lsn, end_lsn)
write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
}
}
@@ -477,16 +454,10 @@ impl InMemoryLayer {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
let key = InMemoryLayerFileId(file.id());
Ok(InMemoryLayer {
file_id: key,
local_path_str: {
let mut buf = String::new();
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
buf.into()
},
frozen_local_path_str: OnceLock::new(),
conf,
timeline_id,
tenant_shard_id,
@@ -494,7 +465,7 @@ impl InMemoryLayer {
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
inner: RwLock::new(InMemoryLayerInner {
index: BTreeMap::new(),
index: HashMap::new(),
file,
resource_units: GlobalResourceUnits::new(),
}),
@@ -581,15 +552,6 @@ impl InMemoryLayer {
);
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
self.frozen_local_path_str
.set({
let mut buf = String::new();
inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
.unwrap();
buf.into()
})
.expect("frozen_local_path_str set only once");
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
@@ -597,17 +559,14 @@ impl InMemoryLayer {
}
}
/// Write this frozen in-memory layer to disk. If `key_range` is set, the delta
/// layer will only contain the key range the user specifies, and may return `None`
/// if there are no matching keys.
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(
&self,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
) -> Result<Option<ResidentLayer>> {
) -> Result<ResidentLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -621,21 +580,6 @@ impl InMemoryLayer {
let end_lsn = *self.end_lsn.get().unwrap();
let keys: Vec<_> = if let Some(key_range) = key_range {
inner
.index
.iter()
.filter(|(k, _)| key_range.contains(k))
.map(|(k, m)| (k.to_i128(), m))
.collect()
} else {
inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect()
};
if keys.is_empty() {
return Ok(None);
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
@@ -649,17 +593,26 @@ impl InMemoryLayer {
let cursor = inner.file.block_cursor();
// Sort the keys because delta layer writer expects them sorted.
//
// NOTE: this sort can take up significant time if the layer has millions of
// keys. To speed up all the comparisons we convert the key to i128 and
// keep the value as a reference.
let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
keys.sort_unstable_by_key(|k| k.0);
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
for (key, vec_map) in inner.index.iter() {
for (key, vec_map) in keys.iter() {
let key = Key::from_i128(*key);
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init)
.put_value_bytes(key, *lsn, buf, will_init)
.await;
res?;
}
@@ -667,6 +620,6 @@ impl InMemoryLayer {
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
Ok(Some(delta_layer))
Ok(delta_layer)
}
}
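
The hunk above speeds up the pre-write sort by flattening each key to an `i128` before sorting, so the hot path compares plain integers instead of comparing the structured key field by field when a layer holds millions of entries. A minimal sketch of the same idea; `DemoKey` and its bit packing are illustrative stand-ins, not the pageserver's `Key` type:

```rust
// Illustrative only: `DemoKey` and its packing are stand-ins, not the pageserver's `Key`.
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct DemoKey {
    rel: u32,
    fork: u8,
    blkno: u32,
}

impl DemoKey {
    /// Pack the fields so that integer order matches field-by-field order.
    fn to_i128(self) -> i128 {
        ((self.rel as i128) << 40) | ((self.fork as i128) << 32) | self.blkno as i128
    }
}

/// Sort index entries by the flattened key; values stay borrowed, and only the
/// cheap integer is compared during the sort.
fn sorted_entries(index: &HashMap<DemoKey, u64>) -> Vec<(i128, &u64)> {
    let mut entries: Vec<_> = index.iter().map(|(k, off)| (k.to_i128(), off)).collect();
    entries.sort_unstable_by_key(|(k, _)| *k);
    entries
}

fn main() {
    let mut index = HashMap::new();
    index.insert(DemoKey { rel: 2, fork: 0, blkno: 7 }, 128u64);
    index.insert(DemoKey { rel: 1, fork: 0, blkno: 3 }, 0u64);
    for (key, offset) in sorted_entries(&index) {
        println!("{key:x} -> {offset}");
    }
}
```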

View File

@@ -116,12 +116,6 @@ impl AsLayerDesc for Layer {
}
}
impl PartialEq for Layer {
fn eq(&self, other: &Self) -> bool {
Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
}
}
impl Layer {
/// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted(
@@ -336,12 +330,6 @@ impl Layer {
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
.await
.map_err(|err| match err {
GetVectoredError::Other(err) => GetVectoredError::Other(
err.context(format!("get_values_reconstruct_data for layer {self}")),
),
err => err,
})
}
/// Download the layer if evicted.
@@ -401,10 +389,6 @@ impl Layer {
&self.0.path
}
pub(crate) fn debug_str(&self) -> &Arc<str> {
&self.0.debug_str
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.0.metadata()
}
@@ -527,9 +511,6 @@ struct LayerInner {
/// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf,
/// String representation of the layer, used for traversal id.
debug_str: Arc<str>,
desc: PersistentLayerDesc,
/// Timeline access is needed for remote timeline client and metrics.
@@ -623,17 +604,9 @@ enum Status {
impl Drop for LayerInner {
fn drop(&mut self) {
// if there was a pending eviction, mark it cancelled here to balance metrics
if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
{
// eviction has already been started
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
// eviction request is intentionally not honored as no one is present to wait for it
// and we could be delaying shutdown for nothing.
}
if !*self.wanted_deleted.get_mut() {
// should we try to evict if the last wish was for eviction? seems more like a hazard
// than a clear win.
return;
}
@@ -735,7 +708,6 @@ impl LayerInner {
LayerInner {
conf,
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
path,
desc,
timeline: Arc::downgrade(timeline),
@@ -1580,8 +1552,8 @@ impl Drop for DownloadedLayer {
if let Some(owner) = self.owner.upgrade() {
owner.on_downloaded_layer_drop(self.version);
} else {
// Layer::drop will handle cancelling the eviction; because of drop order and
// `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
// no need to do anything, we are shutting down
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
}
}
}
@@ -1780,28 +1752,6 @@ impl ResidentLayer {
}
}
/// FIXME: truncate is bad name because we are not truncating anything, but copying the
/// filtered parts.
#[cfg(test)]
pub(super) async fn copy_delta_prefix(
&self,
writer: &mut super::delta_layer::DeltaLayerWriter,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use LayerKind::*;
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => d
.copy_prefix(writer, truncate_at, ctx)
.await
.with_context(|| format!("truncate {self}")),
Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")),
}
}
pub(crate) fn local_path(&self) -> &Utf8Path {
&self.owner.0.path
}
@@ -1811,14 +1761,14 @@ impl ResidentLayer {
}
#[cfg(test)]
pub(crate) async fn as_delta(
&self,
pub(crate) async fn get_inner_delta<'a>(
&'a self,
ctx: &RequestContext,
) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
use LayerKind::*;
match self.downloaded.get(&self.owner.0, ctx).await? {
Delta(ref d) => Ok(d),
Image(_) => Err(anyhow::anyhow!("image layer")),
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
LayerKind::Delta(d) => Ok(d),
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
}
}
}

View File

@@ -721,110 +721,11 @@ async fn evict_and_wait_does_not_wait_for_download() {
layer.evict_and_wait(FOREVER).await.unwrap();
}
/// Asserts that there is no miscalculation when a Layer is dropped while it is being kept
/// resident, and the resident handle is the last value keeping it alive.
///
/// Also checks that the same does not happen on a non-evicted layer (regression test).
#[tokio::test(start_paused = true)]
async fn eviction_cancellation_on_drop() {
use crate::repository::Value;
use bytes::Bytes;
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
{
// create_test_timeline wrote us one layer, write another
let mut writer = timeline.writer().await;
writer
.put(
Key::from_i128(5),
Lsn(0x20),
&Value::Image(Bytes::from_static(b"this does not matter either")),
&ctx,
)
.await
.unwrap();
writer.finish_write(Lsn(0x20));
}
timeline.freeze_and_flush().await.unwrap();
// wait for the upload to complete so our Arc::strong_count assertion holds
timeline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let (evicted_layer, not_evicted) = {
let mut layers = {
let mut guard = timeline.layers.write().await;
let layers = guard.likely_resident_layers().collect::<Vec<_>>();
// remove the layers from layermap
guard.finish_gc_timeline(&layers);
layers
};
assert_eq!(layers.len(), 2);
(layers.pop().unwrap(), layers.pop().unwrap())
};
let victims = [(evicted_layer, true), (not_evicted, false)];
for (victim, evict) in victims {
let resident = victim.keep_resident().await.unwrap();
drop(victim);
assert_eq!(Arc::strong_count(&resident.owner.0), 1);
if evict {
let evict_and_wait = resident.owner.evict_and_wait(FOREVER);
// drive the future to await on the status channel, and then drop it
tokio::time::timeout(ADVANCE, evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
}
// 1 == we only evict one of the layers
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
drop(resident);
// run any spawned
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
);
}
}
/// A test case to remind you the cost of these structures. You can bump the size limit
/// below if it is really necessary to add more fields to the structures.
#[test]
fn layer_size() {
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
// it also has the utf8 path
}

View File

@@ -62,7 +62,7 @@ impl BackgroundLoopKind {
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> tokio::sync::SemaphorePermit<'static> {
) -> impl Drop {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
.with_label_values(&[loop_kind.as_static_str()])
.guard();

File diff suppressed because it is too large

View File

@@ -9,13 +9,13 @@ use std::ops::{Deref, Range};
use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
use super::{CompactFlags, DurationRecorder, RecordedDuration, Timeline};
use anyhow::{anyhow, Context};
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -102,7 +102,7 @@ impl Timeline {
)
.await
{
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
Ok((partitioning, lsn)) => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::extend(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
@@ -115,37 +115,17 @@ impl Timeline {
// 3. Create new image layers for partitions that have been modified
// "enough".
let dense_layers = self
let layers = self
.create_image_layers(
&dense_partitioning,
&partitioning,
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
flags.contains(CompactFlags::ForceImageLayerCreation),
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
// For now, nothing will be produced...
let sparse_layers = self
.create_image_layers(
&sparse_partitioning.clone().into_dense(),
lsn,
if flags.contains(CompactFlags::ForceImageLayerCreation) {
ImageLayerCreationMode::Force
} else {
ImageLayerCreationMode::Try
},
&image_ctx,
)
.await
.map_err(anyhow::Error::from)?;
assert!(sparse_layers.is_empty());
self.upload_new_image_layers(dense_layers)?;
self.upload_new_image_layers(layers)?;
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -778,9 +758,8 @@ impl Timeline {
return Err(CompactionError::ShuttingDown);
}
let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
// TODO(chi): ignore sparse_keyspace for now, compact it in the future.
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,
@@ -852,10 +831,6 @@ impl CompactionJobExecutor for TimelineAdaptor {
type RequestContext = crate::context::RequestContext;
fn get_shard_identity(&self) -> &ShardIdentity {
self.timeline.get_shard_identity()
}
async fn get_layers(
&mut self,
key_range: &Range<Key>,

View File

@@ -188,10 +188,24 @@ impl Timeline {
) -> ControlFlow<()> {
let now = SystemTime::now();
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
);
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
.await?;
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
match self
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
{
ControlFlow::Break(()) => return ControlFlow::Break(()),
ControlFlow::Continue(()) => (),
}
#[derive(Debug, Default)]
struct EvictionStats {
@@ -316,27 +330,19 @@ impl Timeline {
gate: &GateGuard,
ctx: &RequestContext,
) -> ControlFlow<()> {
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
.await
}
async fn acquire_imitation_permit(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
);
tokio::select! {
permit = acquire_permit => ControlFlow::Continue(permit),
_ = cancel.cancelled() => ControlFlow::Break(()),
_ = self.cancel.cancelled() => ControlFlow::Break(()),
}
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
}
/// If we evict layers but keep cached values derived from those layers, then
@@ -370,10 +376,9 @@ impl Timeline {
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
permit: tokio::sync::SemaphorePermit<'static>,
ctx: &RequestContext,
) -> ControlFlow<()> {
if !self.tenant_shard_id.is_shard_zero() {
if !self.tenant_shard_id.is_zero() {
// Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
// for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
// skip imitating logical size accesses for eviction purposes.
@@ -403,28 +408,7 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let (mut state, _permit) = {
if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
(locked, permit)
} else {
// we might need to wait for a long time here in case of pathological synthetic
// size calculation performance
drop(permit);
let locked = tokio::select! {
locked = tenant.eviction_task_tenant_state.lock() => locked,
_ = self.cancel.cancelled() => {
return ControlFlow::Break(())
},
_ = cancel.cancelled() => {
return ControlFlow::Break(())
}
};
// then reacquire -- this will be bad if there is a lot of traffic, but because we
// released the permit, the overall latency will be much better.
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
(locked, permit)
}
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
_ => {

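
The hunk above documents a design choice: if the per-tenant state mutex is contended, the background-task permit is dropped while waiting for the lock (the wait can be long when synthetic size calculation is slow) and is reacquired once the lock is held. A minimal sketch of that drop-and-reacquire pattern using tokio primitives; the `Semaphore` stands in for the background-task rate limiter and all names are illustrative, not the pageserver's actual types:

```rust
// Illustrative sketch only; Semaphore stands in for the background-task rate
// limiter, Mutex<u64> for the per-tenant eviction state.
use tokio::sync::{Mutex, Semaphore, SemaphorePermit};
use tokio_util::sync::CancellationToken;

async fn acquire<'a>(
    limiter: &'a Semaphore,
    cancel: &CancellationToken,
) -> Option<SemaphorePermit<'a>> {
    tokio::select! {
        permit = limiter.acquire() => permit.ok(),
        _ = cancel.cancelled() => None,
    }
}

async fn imitate_accesses(
    limiter: &Semaphore,
    state: &Mutex<u64>,
    cancel: &CancellationToken,
) -> Option<()> {
    let permit = acquire(limiter, cancel).await?;

    let (mut locked, _permit) = if let Ok(locked) = state.try_lock() {
        // Fast path: the lock is free, keep the permit we already hold.
        (locked, permit)
    } else {
        // Slow path: the wait may be long, so release the permit first so that
        // other background tasks can make progress meanwhile.
        drop(permit);
        let locked = tokio::select! {
            locked = state.lock() => locked,
            _ = cancel.cancelled() => return None,
        };
        // Reacquire before doing the work; the permit was not held while we
        // were blocked on the lock, which keeps overall latency down.
        let permit = acquire(limiter, cancel).await?;
        (locked, permit)
    };

    *locked += 1; // stand-in for the actual imitation work done under the lock
    Some(())
}

#[tokio::main]
async fn main() {
    let limiter = Semaphore::new(1);
    let state = Mutex::new(0);
    let cancel = CancellationToken::new();
    let _ = imitate_accesses(&limiter, &state, &cancel).await;
}
```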
View File

@@ -22,12 +22,10 @@ use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeli
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::models::TimelineState;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::proto::{
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
SubscribeByFilterRequest, TypeSubscription, TypedMessage,
};
use storage_broker::{BrokerClientChannel, Code, Streaming};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -91,14 +89,6 @@ pub(super) async fn connection_manager_loop_step(
.timeline
.subscribe_for_state_updates();
let mut wait_lsn_status = connection_manager_state
.timeline
.subscribe_for_wait_lsn_updates();
// TODO: create a separate config option for discovery request interval
let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout;
let mut last_discovery_ts: Option<std::time::Instant> = None;
// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
@@ -107,12 +97,10 @@ pub(super) async fn connection_manager_loop_step(
loop {
let time_until_next_retry = connection_manager_state.time_until_next_retry();
let any_activity = connection_manager_state.wal_connection.is_some()
|| !connection_manager_state.wal_stream_candidates.is_empty();
// These things are happening concurrently:
//
// - cancellation request
// - cancellation request
// - keep receiving WAL on the current connection
// - if the shared state says we need to change connection, disconnect and return
// - this runs in a separate task and we receive updates via a watch channel
@@ -120,7 +108,6 @@ pub(super) async fn connection_manager_loop_step(
// - receive updates from broker
// - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently
// - if there's no connection and no candidates, try to send a discovery request
// NB: make sure each of the select expressions is cancellation-safe
// (no need for arms to be cancellation-safe).
@@ -227,65 +214,6 @@ pub(super) async fn connection_manager_loop_step(
}
}
} => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
Some(()) = async {
// Reminder: this match arm needs to be cancellation-safe.
// Calculating time needed to wait until sending the next discovery request.
// Current implementation is conservative and sends discovery requests only when there are no candidates.
if any_activity {
// No need to send discovery requests if there is an active connection or candidates.
return None;
}
// Waiting for an active wait_lsn request.
while wait_lsn_status.borrow().is_none() {
if wait_lsn_status.changed().await.is_err() {
// wait_lsn_status channel was closed, exiting
warn!("wait_lsn_status channel was closed in connection_manager_loop_step");
return None;
}
}
// All preconditions met, preparing to send a discovery request.
let now = std::time::Instant::now();
let next_discovery_ts = last_discovery_ts
.map(|ts| ts + discovery_request_interval)
.unwrap_or_else(|| now);
if next_discovery_ts > now {
// Prevent sending discovery requests too frequently.
tokio::time::sleep(next_discovery_ts - now).await;
}
let tenant_timeline_id = Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
let msg = TypedMessage {
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
safekeeper_timeline_info: None,
safekeeper_discovery_request: Some(request),
safekeeper_discovery_response: None,
};
last_discovery_ts = Some(std::time::Instant::now());
debug!("No active connection and no candidates, sending discovery request to the broker");
// Cancellation safety: we want to send a message to the broker, but the publish_one()
// call can get cancelled by the other select! arm. This is absolutely fine, because
// we just want to receive broker updates and discovery is not important if we already
// receive updates.
//
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
// This is totally fine because of the reason above.
// This is a fire-and-forget request, we don't care about the response
let _ = broker_client.publish_one(msg).await;
debug!("Discovery request sent to the broker");
None
} => {}
}
if let Some(new_candidate) = connection_manager_state.next_connection_candidate() {
@@ -303,7 +231,7 @@ async fn subscribe_for_timeline_updates(
broker_client: &mut BrokerClientChannel,
id: TenantTimelineId,
cancel: &CancellationToken,
) -> Result<Streaming<TypedMessage>, Cancelled> {
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
let mut attempt = 0;
loop {
exponential_backoff(
@@ -316,27 +244,17 @@ async fn subscribe_for_timeline_updates(
attempt += 1;
// subscribe to the specific timeline
let request = SubscribeByFilterRequest {
types: vec![
TypeSubscription {
r#type: MessageType::SafekeeperTimelineInfo as i32,
},
TypeSubscription {
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
},
],
tenant_timeline_id: Some(FilterTenantTimelineId {
enabled: true,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
}),
}),
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SubscribeSafekeeperInfoRequest {
subscription_key: Some(key),
};
match {
tokio::select! {
r = broker_client.subscribe_by_filter(request) => { r }
r = broker_client.subscribe_safekeeper_info(request) => { r }
_ = cancel.cancelled() => { return Err(Cancelled); }
}
} {
@@ -480,7 +398,7 @@ struct RetryInfo {
/// Data about the timeline to connect to, received from the broker.
#[derive(Debug, Clone)]
struct BrokerSkTimeline {
timeline: SafekeeperDiscoveryResponse,
timeline: SafekeeperTimelineInfo,
/// Time at which the data was fetched from the broker last time, to track the stale data.
latest_update: NaiveDateTime,
}
@@ -688,41 +606,7 @@ impl ConnectionManagerState {
}
/// Adds another broker timeline into the state, if it's more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
let mut is_discovery = false;
let timeline_update = match typed_msg.r#type() {
MessageType::SafekeeperTimelineInfo => {
let info = match typed_msg.safekeeper_timeline_info {
Some(info) => info,
None => {
warn!("bad proto message from broker: no safekeeper_timeline_info");
return;
}
};
SafekeeperDiscoveryResponse {
safekeeper_id: info.safekeeper_id,
tenant_timeline_id: info.tenant_timeline_id,
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
}
}
MessageType::SafekeeperDiscoveryResponse => {
is_discovery = true;
match typed_msg.safekeeper_discovery_response {
Some(response) => response,
None => {
warn!("bad proto message from broker: no safekeeper_discovery_response");
return;
}
}
}
_ => {
// unexpected message
return;
}
};
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
WALRECEIVER_BROKER_UPDATES.inc();
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -735,11 +619,7 @@ impl ConnectionManagerState {
);
if old_entry.is_none() {
info!(
?is_discovery,
%new_safekeeper_id,
"New SK node was added",
);
info!("New SK node was added: {new_safekeeper_id}");
WALRECEIVER_CANDIDATES_ADDED.inc();
}
}
@@ -938,7 +818,7 @@ impl ConnectionManagerState {
fn select_connection_candidate(
&self,
node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
self.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.max_by_key(|(_, info, _)| info.commit_lsn)
@@ -948,7 +828,7 @@ impl ConnectionManagerState {
/// Some safekeepers are filtered by the retry cooldown.
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> {
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
let now = Utc::now().naive_utc();
self.wal_stream_candidates
@@ -1088,11 +968,19 @@ mod tests {
latest_update: NaiveDateTime,
) -> BrokerSkTimeline {
BrokerSkTimeline {
timeline: SafekeeperDiscoveryResponse {
timeline: SafekeeperTimelineInfo {
safekeeper_id: 0,
tenant_timeline_id: None,
term: 0,
last_log_term: 0,
flush_lsn: 0,
commit_lsn,
backup_lsn: 0,
remote_consistent_lsn: 0,
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
http_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
},
latest_update,

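
The discovery-request arm in this hunk throttles requests by remembering when the last one was sent (`last_discovery_ts`) and waiting until `last + discovery_request_interval` before sending the next one. A small, self-contained sketch of that throttling logic with illustrative names:

```rust
// Illustrative names; mirrors last_discovery_ts / discovery_request_interval above.
use std::time::{Duration, Instant};

struct DiscoveryTimer {
    interval: Duration,
    last_sent: Option<Instant>,
}

impl DiscoveryTimer {
    fn new(interval: Duration) -> Self {
        Self { interval, last_sent: None }
    }

    /// How long to wait before the next request may be sent (zero if none was sent yet).
    fn time_until_next(&self, now: Instant) -> Duration {
        match self.last_sent {
            None => Duration::ZERO,
            Some(ts) => (ts + self.interval).saturating_duration_since(now),
        }
    }

    fn mark_sent(&mut self, now: Instant) {
        self.last_sent = Some(now);
    }
}

fn main() {
    let mut timer = DiscoveryTimer::new(Duration::from_secs(10));
    // The first request may be sent immediately.
    assert_eq!(timer.time_until_next(Instant::now()), Duration::ZERO);
    timer.mark_sent(Instant::now());
    // Later requests are throttled to at most one per interval.
    assert!(timer.time_until_next(Instant::now()) > Duration::from_secs(9));
}
```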
View File

@@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection(
// Send the replication feedback message.
// Regular standby_status_update fields are put into this message.
let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() {
let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
timeline
.get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::User,

View File

@@ -61,18 +61,18 @@ pub struct VectoredRead {
}
impl VectoredRead {
pub(crate) fn size(&self) -> usize {
pub fn size(&self) -> usize {
(self.end - self.start) as usize
}
}
#[derive(Eq, PartialEq)]
pub(crate) enum VectoredReadExtended {
enum VectoredReadExtended {
Yes,
No,
}
pub(crate) struct VectoredReadBuilder {
struct VectoredReadBuilder {
start: u64,
end: u64,
blobs_at: VecMap<u64, BlobMeta>,
@@ -80,17 +80,7 @@ pub(crate) struct VectoredReadBuilder {
}
impl VectoredReadBuilder {
/// Start building a new vectored read.
///
/// Note that by design, this does not check against reading more than `max_read_size` to
/// support reading blobs larger than the configuration value. The builder is single-use
/// after that, however.
pub(crate) fn new(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: usize,
) -> Self {
fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
let mut blobs_at = VecMap::default();
blobs_at
.append(start_offset, meta)
@@ -107,8 +97,7 @@ impl VectoredReadBuilder {
/// Attempt to extend the current read with a new blob if the start
/// offset matches the current end of the vectored read
/// and the resulting size is below the max read size
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
tracing::trace!(start, end, "trying to extend");
fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
let size = (end - start) as usize;
if self.end == start && self.size() + size <= self.max_read_size {
self.end = end;
@@ -122,11 +111,11 @@ impl VectoredReadBuilder {
VectoredReadExtended::No
}
pub(crate) fn size(&self) -> usize {
fn size(&self) -> usize {
(self.end - self.start) as usize
}
pub(crate) fn build(self) -> VectoredRead {
fn build(self) -> VectoredRead {
VectoredRead {
start: self.start,
end: self.end,

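
`VectoredReadBuilder::extend` above merges a new blob into the current read only if the blob starts exactly at the read's current end and the combined size stays within the max read size; otherwise the caller starts a new read. A simplified sketch of that coalescing rule and how a caller might plan reads with it (per-blob metadata omitted, names illustrative):

```rust
// Simplified: no per-blob metadata, just offsets. Illustrative only.
#[derive(Debug, PartialEq)]
enum Extended {
    Yes,
    No,
}

struct ReadBuilder {
    start: u64,
    end: u64,
    max_read_size: usize,
}

impl ReadBuilder {
    fn size(&self) -> usize {
        (self.end - self.start) as usize
    }

    /// Merge `[start, end)` into this read only if it is contiguous with the
    /// current end and the merged size stays within `max_read_size`.
    fn extend(&mut self, start: u64, end: u64) -> Extended {
        let size = (end - start) as usize;
        if self.end == start && self.size() + size <= self.max_read_size {
            self.end = end;
            Extended::Yes
        } else {
            Extended::No
        }
    }
}

/// Coalesce a sorted list of blob ranges into as few reads as possible.
fn plan_reads(blobs: &[(u64, u64)], max_read_size: usize) -> Vec<(u64, u64)> {
    let mut reads = Vec::new();
    let mut cur: Option<ReadBuilder> = None;
    for &(start, end) in blobs {
        let extended = match cur.as_mut() {
            Some(b) => b.extend(start, end),
            None => Extended::No,
        };
        if extended == Extended::No {
            if let Some(b) = cur.take() {
                reads.push((b.start, b.end));
            }
            cur = Some(ReadBuilder { start, end, max_read_size });
        }
    }
    if let Some(b) = cur {
        reads.push((b.start, b.end));
    }
    reads
}

fn main() {
    // The first two blobs are contiguous and coalesce; the gap before the
    // third forces a new read.
    let blobs = [(0, 100), (100, 300), (500, 600)];
    assert_eq!(plan_reads(&blobs, 1024), vec![(0, 300), (500, 600)]);
}
```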
View File

@@ -32,11 +32,11 @@ pub use io_engine::feature_test as io_engine_feature_test;
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
mod metadata;
mod open_options;
use self::owned_buffers_io::write::OwnedAsyncWriter;
pub(crate) use io_engine::IoEngineKind;
pub(crate) use metadata::Metadata;
pub(crate) use open_options::*;
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
pub(crate) mod owned_buffers_io {
//! Abstractions for IO with owned buffers.
//!
@@ -1083,17 +1083,6 @@ impl Drop for VirtualFile {
}
}
impl OwnedAsyncWriter for VirtualFile {
#[inline(always)]
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
let (buf, res) = VirtualFile::write_all(self, buf).await;
res.map(move |v| (v, buf))
}
}
impl OpenFiles {
fn new(num_slots: usize) -> OpenFiles {
let mut slots = Box::new(Vec::with_capacity(num_slots));

View File

@@ -1,45 +1,33 @@
use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
use tokio_epoll_uring::{BoundedBuf, IoBuf};
pub struct Writer<W> {
dst: W,
pub struct Writer {
dst: VirtualFile,
bytes_amount: u64,
}
impl<W> Writer<W> {
pub fn new(dst: W) -> Self {
impl Writer {
pub fn new(dst: VirtualFile) -> Self {
Self {
dst,
bytes_amount: 0,
}
}
pub fn bytes_written(&self) -> u64 {
self.bytes_amount
}
pub fn as_inner(&self) -> &W {
&self.dst
}
/// Returns the wrapped `VirtualFile` object as well as the number
/// of bytes that were written to it through this object.
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub fn into_inner(self) -> (u64, W) {
pub fn into_inner(self) -> (u64, VirtualFile) {
(self.bytes_amount, self.dst)
}
}
impl<W> OwnedAsyncWriter for Writer<W>
where
W: OwnedAsyncWriter,
{
impl OwnedAsyncWriter for Writer {
#[inline(always)]
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
buf: B,
) -> std::io::Result<(usize, B::Buf)> {
let (nwritten, buf) = self.dst.write_all(buf).await?;
let (buf, res) = self.dst.write_all(buf).await;
let nwritten = res?;
self.bytes_amount += u64::try_from(nwritten).unwrap();
Ok((nwritten, buf))
}

View File

@@ -10,14 +10,14 @@ pub trait OwnedAsyncWriter {
) -> std::io::Result<(usize, B::Buf)>;
}
/// A wrapper around an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
/// small writes into larger writes of size [`Buffer::cap`].
/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes
/// into `BUFFER_SIZE`-sized writes.
///
/// # Passthrough Of Large Writers
///
/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
/// cause the internal buffer to be flushed prematurely so that the large
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
/// buffer to be flushed, even if it is not full yet. Then, the large
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
///
/// This pass-through is generally beneficial for throughput, but if
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
@@ -25,38 +25,27 @@ pub trait OwnedAsyncWriter {
///
/// In such cases, a different implementation that always buffers in memory
/// may be preferable.
pub struct BufferedWriter<B, W> {
pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
writer: W,
/// invariant: always remains Some(buf) except
/// - while IO is ongoing => goes back to Some() once the IO completed successfully
/// - after an IO error => stays `None` forever
/// In these exceptional cases, it's `None`.
buf: Option<B>,
// invariant: always remains Some(buf)
// with buf.capacity() == BUFFER_SIZE except
// - while IO is ongoing => goes back to Some() once the IO completed successfully
// - after an IO error => stays `None` forever
// In these exceptional cases, it's `None`.
buf: Option<BytesMut>,
}
impl<B, Buf, W> BufferedWriter<B, W>
impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
where
B: Buffer<IoBuf = Buf> + Send,
Buf: IoBuf + Send,
W: OwnedAsyncWriter,
{
pub fn new(writer: W, buf: B) -> Self {
pub fn new(writer: W) -> Self {
Self {
writer,
buf: Some(buf),
buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
}
}
pub fn as_inner(&self) -> &W {
&self.writer
}
/// Panics if used after any of the write paths returned an error
pub fn inspect_buffer(&self) -> &B {
self.buf()
}
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
self.flush().await?;
let Self { buf, writer } = self;
@@ -64,144 +53,61 @@ where
Ok(writer)
}
#[inline(always)]
fn buf(&self) -> &B {
self.buf
.as_ref()
.expect("must not use after we returned an error")
}
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn write_buffered<S: IoBuf>(&mut self, chunk: Slice<S>) -> std::io::Result<(usize, S)>
pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
where
S: IoBuf + Send,
B: IoBuf + Send,
{
let chunk_len = chunk.len();
// avoid memcpy for the middle of the chunk
if chunk.len() >= self.buf().cap() {
if chunk.len() >= BUFFER_SIZE {
self.flush().await?;
// do a big write, bypassing `buf`
assert_eq!(
self.buf
.as_ref()
.expect("must not use after an error")
.pending(),
.len(),
0
);
let chunk_len = chunk.len();
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
assert_eq!(nwritten, chunk_len);
return Ok((nwritten, chunk));
drop(chunk);
return Ok(());
}
// in-memory copy the < BUFFER_SIZED tail of the chunk
assert!(chunk.len() < self.buf().cap());
let mut slice = &chunk[..];
while !slice.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = buf.cap() - buf.pending();
let have = slice.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&slice[..n]);
slice = &slice[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
self.flush().await?;
}
}
assert!(slice.is_empty(), "by now we should have drained the chunk");
Ok((chunk_len, chunk.into_inner()))
}
/// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
///
/// It is less performant because we always have to copy the borrowed data into the internal buffer
/// before we can do the IO. [`Self::write_buffered`] can avoid this, which is more performant
/// for large writes.
pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result<usize> {
let chunk_len = chunk.len();
assert!(chunk.len() < BUFFER_SIZE);
let mut chunk = &chunk[..];
while !chunk.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = buf.cap() - buf.pending();
let need = BUFFER_SIZE - buf.len();
let have = chunk.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&chunk[..n]);
chunk = &chunk[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
if buf.len() >= BUFFER_SIZE {
assert_eq!(buf.len(), BUFFER_SIZE);
self.flush().await?;
}
}
Ok(chunk_len)
assert!(chunk.is_empty(), "by now we should have drained the chunk");
Ok(())
}
async fn flush(&mut self) -> std::io::Result<()> {
let buf = self.buf.take().expect("must not use after an error");
let buf_len = buf.pending();
if buf_len == 0 {
if buf.is_empty() {
self.buf = Some(buf);
return Ok(());
return std::io::Result::Ok(());
}
let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?;
let buf_len = buf.len();
let (nwritten, mut buf) = self.writer.write_all(buf).await?;
assert_eq!(nwritten, buf_len);
self.buf = Some(Buffer::reuse_after_flush(io_buf));
buf.clear();
self.buf = Some(buf);
Ok(())
}
}
/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones.
pub trait Buffer {
type IoBuf: IoBuf;
/// Capacity of the buffer. Must not change over the lifetime of `self`.
fn cap(&self) -> usize;
/// Add data to the buffer.
/// Panics if there is not enough room to accommodate `other`'s content, i.e.,
/// panics if `other.len() > self.cap() - self.pending()`.
fn extend_from_slice(&mut self, other: &[u8]);
/// Number of bytes in the buffer.
fn pending(&self) -> usize;
/// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data
/// so we can use [`tokio_epoll_uring`] to write it to disk.
fn flush(self) -> Slice<Self::IoBuf>;
/// After the write to disk is done and we have gotten back the slice,
/// [`BufferedWriter`] uses this method to re-use the io buffer.
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
}
impl Buffer for BytesMut {
type IoBuf = BytesMut;
#[inline(always)]
fn cap(&self) -> usize {
self.capacity()
}
fn extend_from_slice(&mut self, other: &[u8]) {
BytesMut::extend_from_slice(self, other)
}
#[inline(always)]
fn pending(&self) -> usize {
self.len()
}
fn flush(self) -> Slice<BytesMut> {
if self.is_empty() {
return self.slice_full();
}
let len = self.len();
self.slice(0..len)
}
fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
iobuf.clear();
iobuf
}
}
impl OwnedAsyncWriter for Vec<u8> {
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
@@ -219,8 +125,6 @@ impl OwnedAsyncWriter for Vec<u8> {
#[cfg(test)]
mod tests {
use bytes::BytesMut;
use super::*;
#[derive(Default)]
@@ -254,7 +158,7 @@ mod tests {
#[tokio::test]
async fn test_buffered_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"a");
write!(writer, b"b");
write!(writer, b"c");
@@ -271,7 +175,7 @@ mod tests {
#[tokio::test]
async fn test_passthrough_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"abc");
write!(writer, b"de");
write!(writer, b"");
@@ -287,7 +191,7 @@ mod tests {
#[tokio::test]
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
let mut writer = BufferedWriter::<2, _>::new(recorder);
write!(writer, b"a");
write!(writer, b"bc");
write!(writer, b"d");
@@ -299,31 +203,4 @@ mod tests {
);
Ok(())
}
#[tokio::test]
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
writer.write_buffered_borrowed(b"abc").await?;
writer.write_buffered_borrowed(b"d").await?;
writer.write_buffered_borrowed(b"e").await?;
writer.write_buffered_borrowed(b"fg").await?;
writer.write_buffered_borrowed(b"hi").await?;
writer.write_buffered_borrowed(b"j").await?;
writer.write_buffered_borrowed(b"klmno").await?;
let recorder = writer.flush_and_into_inner().await?;
assert_eq!(
recorder.writes,
{
let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
expect
}
.iter()
.map(|v| v[..].to_vec())
.collect::<Vec<_>>()
);
Ok(())
}
}
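
The doc comments in this hunk describe two paths: small writes accumulate in the internal buffer until it fills, while writes at least as large as the buffer capacity flush the buffer and pass straight through to the underlying writer. A minimal sketch of that policy, written over `std::io::Write` rather than the crate's owned-buffer `OwnedAsyncWriter` API; the type and names are illustrative:

```rust
use std::io::{self, Write};

/// Minimal sketch of the batching-plus-passthrough policy described above,
/// over `std::io::Write` instead of the crate's owned-buffer writer.
struct SketchBufferedWriter<W: Write> {
    dst: W,
    buf: Vec<u8>,
    cap: usize,
}

impl<W: Write> SketchBufferedWriter<W> {
    fn new(dst: W, cap: usize) -> Self {
        Self { dst, buf: Vec::with_capacity(cap), cap }
    }

    fn write_buffered(&mut self, mut chunk: &[u8]) -> io::Result<()> {
        // Passthrough: a chunk at least as large as the buffer is written
        // directly to the destination, after flushing what is already buffered.
        if chunk.len() >= self.cap {
            self.flush_buf()?;
            return self.dst.write_all(chunk);
        }
        // Otherwise copy into the buffer, flushing whenever it fills up.
        while !chunk.is_empty() {
            let room = self.cap - self.buf.len();
            let n = room.min(chunk.len());
            self.buf.extend_from_slice(&chunk[..n]);
            chunk = &chunk[n..];
            if self.buf.len() == self.cap {
                self.flush_buf()?;
            }
        }
        Ok(())
    }

    fn flush_buf(&mut self) -> io::Result<()> {
        if !self.buf.is_empty() {
            self.dst.write_all(&self.buf)?;
            self.buf.clear();
        }
        Ok(())
    }

    fn into_inner(mut self) -> io::Result<W> {
        self.flush_buf()?;
        Ok(self.dst)
    }
}

fn main() -> io::Result<()> {
    let mut w = SketchBufferedWriter::new(Vec::<u8>::new(), 2);
    w.write_buffered(b"a")?;  // stays in the buffer
    w.write_buffered(b"bc")?; // >= cap: flush "a", then pass "bc" straight through
    w.write_buffered(b"d")?;  // buffered again
    let out = w.into_inner()?; // final flush writes "d"
    assert_eq!(&out[..], b"abcd");
    Ok(())
}
```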

View File

@@ -403,7 +403,7 @@ impl WalIngest {
);
if !key_is_local {
if self.shard.is_shard_zero() {
if self.shard.is_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
self.observe_decoded_block(modification, blk, ctx).await?;
@@ -1034,7 +1034,7 @@ impl WalIngest {
let nblocks = modification
.tline
.get_rel_size(src_rel, Version::Modified(modification), ctx)
.get_rel_size(src_rel, Version::Modified(modification), true, ctx)
.await?;
let dst_rel = RelTag {
spcnode: tablespace_id,
@@ -1068,7 +1068,13 @@ impl WalIngest {
let content = modification
.tline
.get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx)
.get_rel_page_at_lsn(
src_rel,
blknum,
Version::Modified(modification),
true,
ctx,
)
.await?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
@@ -1236,7 +1242,7 @@ impl WalIngest {
};
if modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
self.put_rel_drop(modification, rel, ctx).await?;
@@ -1535,7 +1541,7 @@ impl WalIngest {
nblocks
} else if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1547,7 +1553,7 @@ impl WalIngest {
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
@@ -1644,14 +1650,14 @@ async fn get_relsize(
) -> anyhow::Result<BlockNumber> {
let nblocks = if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
Ok(nblocks)
@@ -1726,29 +1732,29 @@ mod tests {
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
1
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
3
);
@@ -1756,46 +1762,46 @@ mod tests {
// Check page contents at each LSN
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
test_img("foo blk 0 at 2")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
test_img("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 2 at 5")
);
@@ -1811,19 +1817,19 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
test_img("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
test_img("foo blk 1 at 4")
);
@@ -1831,13 +1837,13 @@ mod tests {
// should still see the truncated block with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
3
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img("foo blk 2 at 5")
);
@@ -1850,7 +1856,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
.await?,
0
);
@@ -1863,19 +1869,19 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
ZERO_PAGE
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
test_img("foo blk 1")
);
@@ -1888,21 +1894,21 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
1501
);
for blk in 2..1500 {
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
ZERO_PAGE
);
}
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
test_img("foo blk 1500")
);
@@ -1929,13 +1935,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
1
);
@@ -1948,7 +1954,7 @@ mod tests {
// Check that rel is not visible anymore
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
.await?,
false
);
@@ -1966,13 +1972,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
1
);
@@ -2005,24 +2011,24 @@ mod tests {
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
relsize
);
@@ -2033,7 +2039,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
.await?,
test_img(&data)
);
@@ -2050,7 +2056,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
1
);
@@ -2060,7 +2066,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
test_img(&data)
);
@@ -2069,7 +2075,7 @@ mod tests {
// should still see all blocks with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
relsize
);
@@ -2078,7 +2084,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
test_img(&data)
);
@@ -2098,13 +2104,13 @@ mod tests {
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
relsize
);
@@ -2114,7 +2120,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
test_img(&data)
);
@@ -2148,7 +2154,7 @@ mod tests {
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE + 1
);
@@ -2162,7 +2168,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE
);
@@ -2177,7 +2183,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE - 1
);
@@ -2195,7 +2201,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
size as BlockNumber
);

View File

@@ -55,7 +55,6 @@ impl NeonWalRecord {
/// Does replaying this WAL record initialize the page from scratch, or does
/// it need to be applied over the previous image of the page?
pub fn will_init(&self) -> bool {
// If you change this function, you'll also need to change ValueBytes::will_init
match self {
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,

View File

@@ -20,7 +20,6 @@
/// Process lifecycle and abstraction for the IPC protocol.
mod process;
pub use process::Kind as ProcessKind;
/// Code to apply [`NeonWalRecord`]s.
pub(crate) mod apply_neon;
@@ -35,7 +34,7 @@ use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::key_to_rel_block;
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
use std::time::Duration;
@@ -55,7 +54,7 @@ pub struct PostgresRedoManager {
tenant_shard_id: TenantShardId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
/// The current [`process::Process`] that is used by new redo requests.
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
/// requests don't use the [`heavier_once_cell::Guard`] to keep hold of
/// their process object; we use [`Arc::clone`] for that.
@@ -67,7 +66,7 @@ pub struct PostgresRedoManager {
/// still be using the old redo process. But, those other tasks will most likely
/// encounter an error as well, and errors are an unexpected condition anyway.
/// So, probably we could get rid of the `Arc` in the future.
redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
}
///
@@ -140,8 +139,8 @@ impl PostgresRedoManager {
}
}
pub fn status(&self) -> WalRedoManagerStatus {
WalRedoManagerStatus {
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
Some(WalRedoManagerStatus {
last_redo_at: {
let at = *self.last_redo_at.lock().unwrap();
at.and_then(|at| {
@@ -150,14 +149,8 @@ impl PostgresRedoManager {
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
})
},
process: self
.redo_process
.get()
.map(|p| WalRedoManagerProcessStatus {
pid: p.id(),
kind: std::borrow::Cow::Borrowed(p.kind().into()),
}),
}
pid: self.redo_process.get().map(|p| p.id()),
})
}
}
@@ -215,33 +208,37 @@ impl PostgresRedoManager {
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
loop {
let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
Ok(guard) => Arc::clone(&guard),
Err(permit) => {
// don't hold poison_guard, the launch code can bail
let start = Instant::now();
let proc = Arc::new(
process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
let proc: Arc<process::WalRedoProcess> =
match self.redo_process.get_or_init_detached().await {
Ok(guard) => Arc::clone(&guard),
Err(permit) => {
// don't hold poison_guard, the launch code can bail
let start = Instant::now();
let proc = Arc::new(
process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process.set(Arc::clone(&proc), permit);
proc
}
};
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process.set(Arc::clone(&proc), permit);
proc
}
};
let started_at = std::time::Instant::now();
// Relational WAL records are applied using wal-redo-postgres
let result = proc
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
.await
.context("apply_wal_records");
let duration = started_at.elapsed();

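
The manager above launches the walredo process lazily and relies on `heavier_once_cell` so that concurrent redo requests coalesce on a single spawn and then share the process via `Arc::clone`. A rough equivalent of that pattern sketched with tokio's `OnceCell` (the real code uses the crate-internal `heavier_once_cell`, which additionally supports replacing a failed value); the `Process` type and its fields here are stand-ins:

```rust
// Illustrative sketch of lazy, coalesced launch + shared handle via Arc.
use std::sync::Arc;
use tokio::sync::OnceCell;

/// Stand-in for the real walredo process handle.
struct Process {
    pid: u32,
}

impl Process {
    async fn launch() -> anyhow::Result<Self> {
        // In the real code this spawns `postgres --wal-redo`.
        Ok(Process { pid: 12345 })
    }
}

struct RedoManager {
    process: OnceCell<Arc<Process>>,
}

impl RedoManager {
    fn new() -> Self {
        Self { process: OnceCell::new() }
    }

    /// Only the first caller pays the launch cost; concurrent first callers
    /// coalesce on a single launch, and everyone shares the same process.
    async fn get_process(&self) -> anyhow::Result<Arc<Process>> {
        let proc = self
            .process
            .get_or_try_init(|| async { Process::launch().await.map(Arc::new) })
            .await?;
        Ok(Arc::clone(proc))
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mgr = RedoManager::new();
    let (a, b) = tokio::join!(mgr.get_process(), mgr.get_process());
    assert_eq!(a?.pid, b?.pid);
    Ok(())
}
```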
View File

@@ -1,67 +1,186 @@
use std::time::Duration;
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
};
use anyhow::Context;
use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use utils::lsn::Lsn;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
io::{Read, Write},
process::{ChildStdin, ChildStdout, Command, Stdio},
sync::{Mutex, MutexGuard},
time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};
mod no_leak_child;
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
mod protocol;
mod process_impl {
pub(super) mod process_async;
pub(super) mod process_std;
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
#[derive(
Clone,
Copy,
Debug,
PartialEq,
Eq,
strum_macros::EnumString,
strum_macros::Display,
strum_macros::IntoStaticStr,
serde_with::DeserializeFromStr,
serde_with::SerializeDisplay,
)]
#[strum(serialize_all = "kebab-case")]
#[repr(u8)]
pub enum Kind {
Sync,
Async,
struct ProcessInput {
stdin: ChildStdin,
n_requests: usize,
}
pub(crate) enum Process {
Sync(process_impl::process_std::WalRedoProcess),
Async(process_impl::process_async::WalRedoProcess),
struct ProcessOutput {
stdout: ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
impl Process {
#[inline(always)]
pub fn launch(
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
Ok(match conf.walredo_process_kind {
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but having it in the argv helps identify the
// walredo process for a particular tenant when debugging a pageserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
n_requests: 0,
}),
stdout: Mutex::new(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
#[inline(always)]
pub(crate) async fn apply_wal_records(
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) fn apply_wal_records(
&self,
rel: RelTag,
blknum: u32,
@@ -69,29 +188,221 @@ impl Process {
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
match self {
Process::Sync(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
let tag = protocol::BufferTag { rel, blknum };
let input = self.stdin.lock().unwrap();
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
Process::Async(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
input: MutexGuard<ProcessInput>,
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
let mut nwrite = 0usize;
while nwrite < writebuf.len() {
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
let n = loop {
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If 'stdin' is writeable, do write.
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
}
if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
}
let request_no = proc.n_requests;
proc.n_requests += 1;
drop(proc);
// To improve walredo performance we separate sending requests and receiving
// responses. Them are protected by different mutexes (output and input).
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
// then there is not warranty that T1 will first granted output mutex lock.
// To address this issue we maintain number of sent requests, number of processed
// responses and ring buffer with pending responses. After sending response
// (under input mutex), threads remembers request number. Then it releases
// input mutex, locks output mutex and fetch in ring buffer all responses until
// its stored request number. The it takes correspondent element from
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut output = self.stdout.lock().unwrap();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
while nresult < BLCKSZ.into() {
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
// We do two things simultaneously: reading response from stdout
// and forward any logging information that the child writes to its stderr to the page server's log.
let n = loop {
match nix::poll::poll(
&mut stdout_pollfds[..],
wal_redo_timeout.as_millis() as i32,
) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If we have some data in stdout, read it to the result buffer.
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
}
if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
pub(crate) fn id(&self) -> u32 {
match self {
Process::Sync(p) => p.id(),
Process::Async(p) => p.id(),
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
pub(crate) fn kind(&self) -> Kind {
match self {
Process::Sync(_) => Kind::Sync,
Process::Async(_) => Kind::Async,
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
}
}

View File

@@ -1,374 +0,0 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
process::{Command, Stdio},
time::Duration,
};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, poison::Poison};
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
struct ProcessInput {
stdin: tokio::process::ChildStdin,
n_requests: usize,
}
struct ProcessOutput {
stdout: tokio::process::ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but, having it in the argv helps indentify the
// walredo process for a particular tenant when debugging a pagserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
let stdin =
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
let stdout = tokio::process::ChildStdout::from_std(stdout)
.context("convert to tokio::ChildStdout")?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: tokio::sync::Mutex::new(Poison::new(
"stdin",
ProcessInput {
stdin,
n_requests: 0,
},
)),
stdout: tokio::sync::Mutex::new(Poison::new(
"stdout",
ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
},
)),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
/// Apply given WAL records ('records') over an old page image. Returns
/// new page image.
///
/// # Cancel-Safety
///
/// Cancellation safe.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
&self,
rel: RelTag,
blknum: u32,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let tag = protocol::BufferTag { rel, blknum };
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let Ok(res) =
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
else {
anyhow::bail!("WAL redo timed out");
};
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
/// # Cancel-Safety
///
/// When not polled to completion (e.g. because in `tokio::select!` another
/// branch becomes ready before this future), concurrent and subsequent
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
/// Dispose of this process instance and create a new one.
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
let request_no = {
let mut lock_guard = self.stdin.lock().await;
let mut poison_guard = lock_guard.check_and_arm()?;
let input = poison_guard.data_mut();
input
.stdin
.write_all(writebuf)
.await
.context("write to walredo stdin")?;
let request_no = input.n_requests;
input.n_requests += 1;
poison_guard.disarm();
request_no
};
// To improve walredo performance we separate sending requests and receiving
// responses. Them are protected by different mutexes (output and input).
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
// then there is not warranty that T1 will first granted output mutex lock.
// To address this issue we maintain number of sent requests, number of processed
// responses and ring buffer with pending responses. After sending response
// (under input mutex), threads remembers request number. Then it releases
// input mutex, locks output mutex and fetch in ring buffer all responses until
// its stored request number. The it takes correspondent element from
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut lock_guard = self.stdout.lock().await;
let mut poison_guard = lock_guard.check_and_arm()?;
let output = poison_guard.data_mut();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
output
.stdout
.read_exact(&mut resultbuf)
.await
.context("read walredo stdout")?;
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
poison_guard.disarm();
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
use std::io::Write;
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
}
}

View File

@@ -1,405 +0,0 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
io::{Read, Write},
process::{ChildStdin, ChildStdout, Command, Stdio},
sync::{Mutex, MutexGuard},
time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
struct ProcessInput {
stdin: ChildStdin,
n_requests: usize,
}
struct ProcessOutput {
stdout: ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but, having it in the argv helps indentify the
// walredo process for a particular tenant when debugging a pagserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
n_requests: 0,
}),
stdout: Mutex::new(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
&self,
rel: RelTag,
blknum: u32,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let tag = protocol::BufferTag { rel, blknum };
let input = self.stdin.lock().unwrap();
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
input: MutexGuard<ProcessInput>,
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
let mut nwrite = 0usize;
while nwrite < writebuf.len() {
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
let n = loop {
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If 'stdin' is writeable, do write.
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
}
if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
}
let request_no = proc.n_requests;
proc.n_requests += 1;
drop(proc);
// To improve walredo performance we separate sending requests and receiving
// responses. Them are protected by different mutexes (output and input).
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
// then there is not warranty that T1 will first granted output mutex lock.
// To address this issue we maintain number of sent requests, number of processed
// responses and ring buffer with pending responses. After sending response
// (under input mutex), threads remembers request number. Then it releases
// input mutex, locks output mutex and fetch in ring buffer all responses until
// its stored request number. The it takes correspondent element from
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut output = self.stdout.lock().unwrap();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
while nresult < BLCKSZ.into() {
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
// We do two things simultaneously: reading response from stdout
// and forward any logging information that the child writes to its stderr to the page server's log.
let n = loop {
match nix::poll::poll(
&mut stdout_pollfds[..],
wal_redo_timeout.as_millis() as i32,
) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If we have some data in stdout, read it to the result buffer.
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
}
if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
}
}

View File

@@ -49,7 +49,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 1;
int neon_protocol_version;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
@@ -381,17 +381,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
pfree(msg);
return false;
}
switch (neon_protocol_version)
{
case 2:
query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
break;
case 1:
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
break;
default:
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
}
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
ret = PQsendQuery(conn, query);
pfree(query);
if (ret != 1)
@@ -452,7 +442,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
return false;
}
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
page_servers[shard_no].conn = conn;
page_servers[shard_no].wes = wes;
@@ -860,10 +850,8 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,
NEON_PROTOCOL_VERSION, 1, 2,
PGC_USERSET,
0, /* no flags required */
NULL, NULL, NULL);

View File

@@ -28,6 +28,13 @@
#define MAX_SHARDS 128
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
/*
* Currently, the protocol version is not sent to the server.
* So it is critical that format of existing commands is not changed.
* New protocol versions can just add new commands.
*/
#define NEON_PROTOCOL_VERSION 2
typedef enum
{
/* pagestore_client -> pagestore */
@@ -37,6 +44,12 @@ typedef enum
T_NeonDbSizeRequest,
T_NeonGetSlruSegmentRequest,
T_NeonExistsV2Request = 10, /* new protocol message tags start from 10 */
T_NeonNblocksV2Request,
T_NeonGetPageV2Request,
T_NeonDbSizeV2Request,
T_NeonGetSlruSegmentV2Request,
/* pagestore -> pagestore_client */
T_NeonExistsResponse = 100,
T_NeonNblocksResponse,

Some files were not shown because too many files have changed in this diff Show More