Compare commits

273 Commits

Author SHA1 Message Date
Anna Khanova
27bc242085 Merge pull request #7119 from neondatabase/rc/proxy/2024-03-14
Proxy release 2024-03-14
2024-03-14 14:57:05 +05:00
Anna Khanova
192b49cc6d Merge branch 'release-proxy' into rc/proxy/2024-03-14 2024-03-14 14:16:36 +05:00
Conrad Ludgate
e1b60f3693 Merge pull request #7041 from neondatabase/rc/proxy/2024-03-07
Proxy release 2024-03-07
2024-03-08 08:19:16 +00:00
Anna Khanova
2804f5323b Merge pull request #6997 from neondatabase/rc/proxy/2024-03-04
Proxy release 2024-03-04
2024-03-04 17:36:11 +04:00
Anna Khanova
676adc6b32 Merge branch 'release-proxy' into rc/proxy/2024-03-04 2024-03-04 16:41:46 +04:00
Nikita Kalyanov
96a4e8de66 Add /terminate API (#6745) (#6853)
this is to speed up suspends, see
https://github.com/neondatabase/cloud/issues/10284


Cherry-pick to release branch to build new compute images
2024-02-22 11:51:19 +02:00
Arseny Sher
01180666b0 Merge pull request #6803 from neondatabase/releases/2024-02-19
Release 2024-02-19
2024-02-19 16:38:35 +04:00
Conrad Ludgate
6c94269c32 Merge pull request #6758 from neondatabase/release-proxy-2024-02-14
2024-02-14 Proxy Release
2024-02-15 09:45:08 +00:00
Anna Khanova
edc691647d Proxy: remove fail fast logic to connect to compute (#6759)
## Problem

Flaky tests

## Summary of changes

Remove failfast logic
2024-02-15 07:42:12 +00:00
Conrad Ludgate
855d7b4781 hold cancel session (#6750)
## Problem

In a recent refactor, we accidentally dropped the cancel session early

## Summary of changes

Hold the cancel session during proxy passthrough
2024-02-14 14:57:22 +00:00
Anna Khanova
c49c9707ce Proxy: send cancel notifications to all instances (#6719)
## Problem

If a cancel request ends up on the wrong proxy instance, it doesn't take
effect.

## Summary of changes

Send redis notifications to all proxy pods about the cancel request.

Related issue: https://github.com/neondatabase/neon/issues/5839,
https://github.com/neondatabase/cloud/issues/10262
2024-02-14 14:57:22 +00:00
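A sketch of the broadcast idea using the `redis` crate's pub/sub; the channel name and payload here are made up for illustration, not taken from the proxy code.

```rust
use redis::Commands;

fn publish_cancel(client: &redis::Client, session_id: &str) -> redis::RedisResult<()> {
    let mut con = client.get_connection()?;
    // Every proxy pod subscribes to this channel; the pod actually holding
    // the session acts on the message, the rest ignore it.
    let _: () = con.publish("proxy:cancel_requests", session_id)?;
    Ok(())
}

fn main() -> redis::RedisResult<()> {
    let client = redis::Client::open("redis://127.0.0.1/")?;
    publish_cancel(&client, "session-1234")
}
```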
Anna Khanova
2227540a0d Proxy refactor auth+connect (#6708)
## Problem

Not really a problem, just refactoring.

## Summary of changes

Separate authenticate from wake compute.

Do not call wake compute a second time if we managed to connect to
postgres, or if we got it from somewhere other than the cache.
2024-02-14 14:57:22 +00:00
Conrad Ludgate
f1347f2417 proxy: add more http logging (#6726)
## Problem

Hard to see where time is spent during the HTTP flow.

## Summary of changes

Add a lot more logging for query state, and add a conn_id field to the
sql-over-http span.
2024-02-14 14:57:22 +00:00
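The span-field half of this change, sketched with `tracing` (the span name matches the commit message; everything else is illustrative):

```rust
use tracing::info_span;

fn handle_sql_over_http(conn_id: u64) {
    // Recording conn_id on the span lets every event emitted inside it be
    // correlated back to a single HTTP connection.
    let _guard = info_span!("sql-over-http", conn_id).entered();
    tracing::info!("query state: parsing request");
}

fn main() {
    tracing_subscriber::fmt().init();
    handle_sql_over_http(42);
}
```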
Conrad Ludgate
30b295b017 proxy: some more parquet data (#6711)
## Summary of changes

add auth_method and database to the parquet logs
2024-02-14 14:57:22 +00:00
Anna Khanova
1cef395266 Proxy: copy bidirectional fork (#6720)
## Problem

`tokio::io::copy_bidirectional` doesn't close the connection once one of
the sides closes it. It's not really suitable for the postgres protocol.

## Summary of changes

Fork `copy_bidirectional` and initiate a shutdown for both connections.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
2024-02-14 14:57:22 +00:00
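A rough sketch of what the fork does differently, under the assumption that shutting both streams down once the copy returns is an acceptable approximation; the real fork lives in the proxy and is more involved.

```rust
use tokio::io::{copy_bidirectional, AsyncRead, AsyncWrite, AsyncWriteExt};

async fn proxy_pass<C, S>(mut client: C, mut compute: S) -> std::io::Result<()>
where
    C: AsyncRead + AsyncWrite + Unpin,
    S: AsyncRead + AsyncWrite + Unpin,
{
    // tokio's copy_bidirectional keeps the other direction open after one
    // side reaches EOF; the forked version instead initiates a shutdown of
    // both connections, which is what the postgres protocol needs.
    let result = copy_bidirectional(&mut client, &mut compute).await;
    let _ = client.shutdown().await;
    let _ = compute.shutdown().await;
    result.map(|_| ())
}
```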
John Spray
78d160f76d Merge pull request #6721 from neondatabase/releases/2024-02-12
Release 2024-02-12
2024-02-12 09:35:30 +00:00
Vlad Lazar
b9238059d6 Merge pull request #6617 from neondatabase/releases/2024-02-05
Release 2024-02-05
2024-02-05 12:50:38 +00:00
Arpad Müller
d0cb4b88c8 Don't preserve temp files on creation errors of delta layers (#6612)
There is currently no cleanup done after a delta layer creation error,
so partially written delta layers can accumulate, and the problem gets
worse as the operation is retried. Therefore, delete them from disk (if
something has been written to disk).
2024-02-05 09:58:18 +00:00
John Spray
1ec3e39d4e Merge pull request #6504 from neondatabase/releases/2024-01-29
Release 2024-01-29
2024-01-29 10:05:01 +00:00
John Spray
a1a74eef2c Merge pull request #6420 from neondatabase/releases/2024-01-22
Release 2024-01-22
2024-01-22 17:24:11 +00:00
John Spray
90e689adda pageserver: mark tenant broken when cancelling attach (#6430)
## Problem

When a tenant is in Attaching state, and waiting for the
`concurrent_tenant_warmup` semaphore, it also listens for the tenant
cancellation token. When that token fires, Tenant::attach drops out.
Meanwhile, Tenant::set_stopping waits forever for the tenant to exit
Attaching state.

Fixes: https://github.com/neondatabase/neon/issues/6423

## Summary of changes

- In the absence of a valid state for the tenant, it is set to Broken in
this path. A more elegant solution will require more refactoring, beyond
this minimal fix.

(cherry picked from commit 93572a3e99)
2024-01-22 16:20:57 +00:00
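A sketch of the fix's shape, with illustrative names (the real logic lives in `Tenant::attach`):

```rust
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
enum TenantState { Attaching, Broken }

// While queued on the warmup semaphore, also watch the cancellation token.
// On cancellation there is no valid state to fall back to, so the tenant is
// moved out of Attaching (to Broken) so set_stopping can't wait forever.
async fn wait_for_warmup_permit(
    warmup: Arc<Semaphore>,
    cancel: &CancellationToken,
    state: &mut TenantState,
) -> Option<OwnedSemaphorePermit> {
    tokio::select! {
        permit = warmup.acquire_owned() => permit.ok(),
        _ = cancel.cancelled() => {
            *state = TenantState::Broken;
            None
        }
    }
}
```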
Christian Schwarz
f0b2d4b053 fixup(#6037): actually fix the issue, #6388 failed to do so (#6429)
Before this patch, the select! still returned immediately if `futs` was
empty. Must have tested a stale build in my manual testing of #6388.

(cherry picked from commit 15c0df4de7)
2024-01-22 15:23:12 +00:00
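The gotcha in miniature: polling an empty `FuturesUnordered` completes immediately with `None`, so selecting on it never actually waits. A minimal demonstration with the kind of guard the fix needed:

```rust
use futures::stream::{FuturesUnordered, StreamExt};
use std::future::Future;
use std::pin::Pin;

#[tokio::main]
async fn main() {
    let mut futs: FuturesUnordered<Pin<Box<dyn Future<Output = ()>>>> =
        FuturesUnordered::new();

    // An empty FuturesUnordered yields Poll::Ready(None) right away, so a
    // select! arm on futs.next() fires immediately instead of pending.
    if futs.is_empty() {
        println!("nothing to wait for; skip the select! entirely");
        return;
    }
    while let Some(()) = futs.next().await {}
}
```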
Anna Khanova
299d9474c9 Proxy: fix gc (#6426)
## Problem

Gc currently doesn't work properly.

## Summary of changes

Change statement on running gc.
2024-01-22 14:39:09 +01:00
Conrad Ludgate
7234208b36 bump shlex (#6421)
## Problem

https://rustsec.org/advisories/RUSTSEC-2024-0006

## Summary of changes

`cargo update -p shlex`

(cherry picked from commit 5559b16953)
2024-01-22 09:49:33 +00:00
Christian Schwarz
93450f11f5 Merge pull request #6354 from neondatabase/releases/2024-01-15
Release 2024-01-15

NB: the previous release PR https://github.com/neondatabase/neon/pull/6286 was accidentally merged by merge-by-squash instead of merge-by-merge-commit.
See https://github.com/neondatabase/neon/pull/6354#issuecomment-1891706321 for more context.
2024-01-15 14:30:25 +01:00
Christian Schwarz
2f0f9edf33 Merge remote-tracking branch 'origin/release' into releases/2024-01-15 2024-01-15 09:36:42 +00:00
Christian Schwarz
d424f2b7c8 empty commit so we can produce a merge commit 2024-01-15 09:36:22 +00:00
Christian Schwarz
21315e80bc Merge branch 'releases/2024-01-08--not-squashed' into releases/2024-01-15 2024-01-15 09:31:07 +00:00
vipvap
483b66d383 Merge branch 'release' into releases/2024-01-08 (not-squashed merge of #6286)
Release PR https://github.com/neondatabase/neon/pull/6286 got
accidentally merged-by-squash instead of merge-by-merge-commit.

This commit shows how things would have looked if #6286 had been
merged by a merge commit.

```
git reset --hard 9f1327772
git merge --no-ff 5c0264b591
```

Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-01-15 09:28:08 +00:00
vipvap
aa72a22661 Release 2024-01-08 (#6286)
Release 2024-01-08
2024-01-08 09:26:27 +00:00
Shany Pozin
5c0264b591 Merge branch 'release' into releases/2024-01-08 2024-01-08 09:34:06 +02:00
Arseny Sher
9f13277729 Merge pull request #6242 from neondatabase/releases/2024-01-02
Release 2024-01-02
2024-01-02 12:04:43 +04:00
Arseny Sher
54aa319805 Don't split WAL record across two XLogData's when sending from safekeepers.
As protocol demands. Not following this makes standby complain about corrupted
WAL in various ways.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
closes https://github.com/neondatabase/cloud/issues/9057
2024-01-02 10:54:00 +04:00
Arseny Sher
4a227484bf Add large insertion and slow WAL sending to test_hot_standby.
To exercise MAX_SEND_SIZE sending from safekeeper; we've had a bug with WAL
records torn across several XLogData messages. Add failpoint to safekeeper to
slow down sending. Also check for corrupted-WAL complaints in the standby log.

Make the test a bit simpler in passing, e.g. we don't need explicit commits as
autocommit is enabled by default.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
https://github.com/neondatabase/cloud/issues/9057
2024-01-02 10:54:00 +04:00
Arseny Sher
2f83f85291 Add failpoint support to safekeeper.
Just a copy paste from pageserver.
2024-01-02 10:54:00 +04:00
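A sketch of what failpoint support looks like with the `fail` crate (which the pageserver's failpoint utilities build on); the failpoint name is invented for illustration.

```rust
// Inert in production builds unless the "failpoints" feature is enabled
// and the failpoint is configured at runtime.
fn send_wal_to_standby() {
    fail::fail_point!("sk-send-wal");
    // ... send the next MAX_SEND_SIZE chunk ...
}

fn main() {
    // e.g. in tests: delay every send by 100ms to exercise torn-record handling
    fail::cfg("sk-send-wal", "sleep(100)").unwrap();
    send_wal_to_standby();
}
```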
Arseny Sher
d6cfcb0d93 Move failpoint support code to utils.
To enable them in safekeeper as well.
2024-01-02 10:54:00 +04:00
Arseny Sher
392843ad2a Fix safekeeper START_REPLICATION (term=n).
It was giving WAL only up to commit_lsn instead of flush_lsn, so recovery of
uncommitted WAL since cdb08f03 hung. Add a test for this.
2024-01-02 10:54:00 +04:00
Arseny Sher
bd4dae8f4a compute_ctl: kill postgres and sync-safekeepers on exit.
Otherwise they are left orphaned when compute_ctl is terminated with a
signal. It was invisible most of the time because normally neon_local or k8s
kills postgres directly and then compute_ctl finishes gracefully. However, in
some tests compute_ctl gets stuck waiting for sync-safekeepers which
intentionally never ends because safekeepers are offline, and we want to stop
compute_ctl without leaving orphans behind.

This is quite a rough approach, which doesn't wait for child termination. A
better way would be to convert compute_ctl to async, which would make waiting
easy.
2024-01-02 10:54:00 +04:00
Shany Pozin
b05fe53cfd Merge pull request #6240 from neondatabase/releases/2024-01-01
Release 2024-01-01
2024-01-01 11:07:30 +02:00
Christian Schwarz
c13a2f0df1 Merge pull request #6192 from neondatabase/releases/2023-12-19
Release 2023-12-19

We need to do a config change that requires restarting the pageservers.
Slip in two metrics-related commits that didn't make this week's regular release.
2023-12-19 14:52:47 +01:00
Christian Schwarz
39be366fc5 higher resolution histograms for getpage@lsn (#6177)
part of https://github.com/neondatabase/cloud/issues/7811
2023-12-19 13:46:59 +00:00
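What "higher resolution" means mechanically, sketched with the `prometheus` crate; the metric name and bucket edges here are illustrative, not the ones the PR shipped.

```rust
use prometheus::{register_histogram, Histogram};

// Finer bucket edges than the defaults, so sub-millisecond getpage@lsn
// latencies don't all land in one bucket.
fn getpage_latency_seconds() -> Histogram {
    register_histogram!(
        "pageserver_getpage_seconds",
        "Time spent serving getpage@lsn requests",
        vec![
            0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01,
            0.025, 0.05, 0.1, 0.25, 0.5, 1.0,
        ]
    )
    .expect("failed to register histogram")
}

fn main() {
    let hist = getpage_latency_seconds();
    hist.observe(0.00042);
}
```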
Christian Schwarz
6eda0a3158 [PRE-MERGE] fix metric pageserver_initial_logical_size_start_calculation
(This is a pre-merge cherry-pick of https://github.com/neondatabase/neon/pull/6191)

It wasn't being incremented.

Fixup of

    commit 1c88824ed0
    Author: Christian Schwarz <christian@neon.tech>
    Date:   Fri Dec 1 12:52:59 2023 +0100

        initial logical size calculation: add a bunch of metrics (#5995)
2023-12-19 13:46:55 +00:00
Shany Pozin
306c7a1813 Merge pull request #6173 from neondatabase/sasha_release_bypassrls_replication
Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles
2023-12-18 22:16:36 +02:00
Sasha Krassovsky
80be423a58 Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles 2023-12-18 10:22:36 -08:00
Shany Pozin
5dcfef82f2 Merge pull request #6163 from neondatabase/releases/2023-12-18
Release 2023-12-18-2
2023-12-18 15:34:17 +02:00
Christian Schwarz
e67b8f69c0 [PRE-MERGE] pageserver: Reduce tracing overhead in timeline::get #6115
Pre-merge `git merge --squash` of
https://github.com/neondatabase/neon/pull/6115

Lowering the tracing level in get_value_reconstruct_data and
get_or_maybe_download from info to debug reduces the overhead
of span creation in non-debug environments.
2023-12-18 13:39:48 +01:00
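The mechanism, sketched: a `debug_span!` under an INFO-level filter is disabled and nearly free to create, where the previous `info_span!` was paid on every read (function name from the commit, the rest illustrative).

```rust
use tracing::{debug_span, Level};

fn get_value_reconstruct_data() {
    // With the subscriber's max level at INFO, this span is disabled and
    // its creation costs almost nothing on the hot read path.
    let _span = debug_span!("get_value_reconstruct_data").entered();
    // ... reconstruct the page image ...
}

fn main() {
    tracing_subscriber::fmt().with_max_level(Level::INFO).init();
    get_value_reconstruct_data();
}
```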
Shany Pozin
e546872ab4 Merge pull request #6158 from neondatabase/releases/2023-12-18
Release 2023-12-18
2023-12-18 14:24:34 +02:00
John Spray
322ea1cf7c pageserver: on-demand activation cleanups (#6157)
## Problem

#6112 added some logs and metrics: clean these up a bit:
- Avoid counting startup completions for tenants launched after startup
- exclude no-op cases from timing histograms 
- remove a rogue log messages
2023-12-18 11:14:19 +00:00
Vadim Kharitonov
3633742de9 Merge pull request #6121 from neondatabase/releases/2023-12-13
Release 2023-12-13
2023-12-13 12:39:43 +01:00
Joonas Koivunen
079d3a37ba Merge remote-tracking branch 'origin/release' into releases/2023-12-13
this handles the conflict introduced by the hotfix.
2023-12-13 10:07:19 +00:00
Vadim Kharitonov
a46e77b476 Merge pull request #6090 from neondatabase/releases/2023-12-11
Release 2023-12-11
2023-12-12 12:10:35 +01:00
Tristan Partin
a92702b01e Add submodule paths as safe directories as a precaution
The check-codestyle-rust-arm job requires this for some reason, so let's
just add them everywhere we do this workaround.
2023-12-11 22:00:35 +00:00
Tristan Partin
8ff3253f20 Fix git ownership issue in check-codestyle-rust-arm
We have this workaround for other jobs. Looks like this one was
forgotten about.
2023-12-11 22:00:35 +00:00
Joonas Koivunen
04b82c92a7 fix: accidential return Ok (#6106)
An error indicating request cancellation OR timeline shutdown was deemed a
reason to exit the background worker that calculates synthetic size. Fix it
so the error is only considered when deciding whether to log such errors.

This conflicted on tenant_shard_id having already replaced tenant_id on
`main`.
2023-12-11 21:41:36 +00:00
Vadim Kharitonov
e5bf423e68 Merge branch 'release' into releases/2023-12-11 2023-12-11 11:55:48 +01:00
Vadim Kharitonov
60af392e45 Merge pull request #6057 from neondatabase/vk/patch_timescale_for_production
Revert timescaledb for pg14 and pg15 (#6056)
2023-12-06 16:21:16 +01:00
Vadim Kharitonov
661fc41e71 Revert timescaledb for pg14 and pg15 (#6056)
```
could not start the compute node: compute is in state "failed": db error: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory Caused by: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory
```
2023-12-06 16:14:07 +01:00
Shany Pozin
702c488f32 Merge pull request #6022 from neondatabase/releases/2023-12-04
Release 2023-12-04
2023-12-05 17:03:28 +02:00
Sasha Krassovsky
45c5122754 Remove trusted from wal2json 2023-12-04 12:36:19 -08:00
Shany Pozin
558394f710 fix merge 2023-12-04 11:41:27 +02:00
Shany Pozin
73b0898608 Merge branch 'release' into releases/2023-12-04 2023-12-04 11:36:26 +02:00
Joonas Koivunen
e65be4c2dc Merge pull request #6013 from neondatabase/releases/2023-12-01-hotfix
fix: use create_new instead of create for mutex file
2023-12-01 15:35:56 +02:00
Joonas Koivunen
40087b8164 fix: use create_new instead of create for mutex file 2023-12-01 12:54:49 +00:00
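The difference the fix relies on, as a small sketch (the lock-file path is illustrative):

```rust
use std::fs::OpenOptions;
use std::io::ErrorKind;

fn main() -> std::io::Result<()> {
    // create_new(true) fails with AlreadyExists if the file is present,
    // which is what makes the file usable as a mutex; plain create(true)
    // would silently open an existing file, so two processes could both
    // believe they hold the lock.
    match OpenOptions::new()
        .write(true)
        .create_new(true)
        .open("/tmp/example.lock")
    {
        Ok(_file) => println!("acquired the lock file"),
        Err(e) if e.kind() == ErrorKind::AlreadyExists => {
            println!("another process holds the lock")
        }
        Err(e) => return Err(e),
    }
    Ok(())
}
```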
Shany Pozin
c762b59483 Merge pull request #5986 from neondatabase/Release-11-30-hotfix
Notify safekeeper readiness with systemd.
2023-11-30 10:01:05 +02:00
Arseny Sher
5d71601ca9 Notify safekeeper readiness with systemd.
To avoid downtime during deploy, as in busy regions initial load can currently
take ~30s.
2023-11-30 08:23:31 +03:00
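A sketch of the readiness notification, assuming the `sd-notify` crate (the commit doesn't say which binding is used); the service's unit file would also need `Type=notify`.

```rust
use sd_notify::NotifyState;

fn main() {
    // ... perform the initial timeline load (can take ~30s in busy regions) ...

    // Only after the load completes do we tell systemd we're ready, so
    // deploys don't route traffic to a safekeeper that is still loading.
    if let Err(e) = sd_notify::notify(false, &[NotifyState::Ready]) {
        eprintln!("sd_notify failed (not running under systemd?): {e}");
    }
}
```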
Shany Pozin
a113c3e433 Merge pull request #5945 from neondatabase/release-2023-11-28-hotfix
Release 2023 11 28 hotfix
2023-11-28 08:14:59 +02:00
Anastasia Lubennikova
e81fc598f4 Update neon extension relocatable for existing installations (#5943) 2023-11-28 00:12:39 +00:00
Anastasia Lubennikova
48b845fa76 Make neon extension relocatable to allow SET SCHEMA (#5942) 2023-11-28 00:12:32 +00:00
Shany Pozin
27096858dc Merge pull request #5922 from neondatabase/releases/2023-11-27
Release 2023-11-27
2023-11-27 09:58:51 +02:00
Shany Pozin
4430d0ae7d Merge pull request #5876 from neondatabase/releases/2023-11-17
Release 2023-11-17
2023-11-20 09:11:58 +02:00
Joonas Koivunen
6e183aa0de Merge branch 'main' into releases/2023-11-17 2023-11-19 15:25:47 +00:00
Vadim Kharitonov
fd6d0b7635 Merge branch 'release' into releases/2023-11-17 2023-11-17 10:51:45 +01:00
Vadim Kharitonov
3710c32aae Merge pull request #5778 from neondatabase/releases/2023-11-03
Release 2023-11-03
2023-11-03 16:06:58 +01:00
Vadim Kharitonov
be83bee49d Merge branch 'release' into releases/2023-11-03 2023-11-03 11:18:15 +01:00
Alexander Bayandin
cf28e5922a Merge pull request #5685 from neondatabase/releases/2023-10-26
Release 2023-10-26
2023-10-27 10:42:12 +01:00
Em Sharnoff
7d384d6953 Bump vm-builder v0.18.2 -> v0.18.4 (#5666)
Only applicable change was neondatabase/autoscaling#584, setting
pgbouncer auth_dbname=postgres in order to fix superuser connections
from preventing dropping databases.
2023-10-26 20:15:45 +01:00
Em Sharnoff
4b3b37b912 Bump vm-builder v0.18.1 -> v0.18.2 (#5646)
Only applicable change was neondatabase/autoscaling#571, removing the
postgres_exporter flags `--auto-discover-databases` and
`--exclude-databases=...`
2023-10-26 20:15:29 +01:00
Shany Pozin
1d8d200f4d Merge pull request #5668 from neondatabase/sp/aux_files_cherry_pick
Cherry pick: Ignore missed AUX_FILES_KEY when generating image layer (#5660)
2023-10-26 10:08:16 +03:00
Konstantin Knizhnik
0d80d6ce18 Ignore missed AUX_FILES_KEY when generating image layer (#5660)
## Problem

Logical replication requires the new AUX_FILES_KEY, which is definitely
absent in existing databases.
We do not have a function to check whether a key exists in our KV storage,
so I have to handle the error in the `list_aux_files` method.
But this key is also included in the key space range and accessed by the
`create_image_layer` method.

## Summary of changes

Check if AUX_FILES_KEY exists before including it in the keyspace.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2023-10-26 09:30:28 +03:00
Shany Pozin
f653ee039f Merge pull request #5638 from neondatabase/releases/2023-10-24
Release 2023-10-24
2023-10-24 12:10:52 +03:00
Em Sharnoff
e614a95853 Merge pull request #5610 from neondatabase/sharnoff/rc-2023-10-20-vm-monitor-fixes
Release 2023-10-20: vm-monitor memory.high throttling fixes
2023-10-20 00:11:06 -07:00
Em Sharnoff
850db4cc13 vm-monitor: Deny, not fail, downscale if no memory stats yet (#5606)
Fixes an issue we observed on staging that happens when the
autoscaler-agent attempts to immediately downscale the VM after binding,
which is typical for pooled computes.

The issue was occurring because the autoscaler-agent was requesting
downscaling before the vm-monitor had gathered sufficient cgroup memory
stats to be confident in approving it. When the vm-monitor returned an
internal error instead of denying downscaling, the autoscaler-agent
retried the connection and immediately hit the same issue (in part
because cgroup stats are collected per-connection, rather than
globally).
2023-10-19 21:56:55 -07:00
Em Sharnoff
8a316b1277 vm-monitor: Log full error on message handling failure (#5604)
There's currently an issue with the vm-monitor on staging that's not
really feasible to debug because the current display impl gives no
context to the errors (just says "failed to downscale").

Logging the full error should help.

For communications with the autoscaler-agent, it's ok to only provide
the outermost cause, because we can cross-reference with the VM logs.
At some point in the future, we may want to change that.
2023-10-19 21:56:50 -07:00
Em Sharnoff
4d13bae449 vm-monitor: Switch from memory.high to polling memory.stat (#5524)
tl;dr it's really hard to avoid throttling from memory.high, and it
counts tmpfs & page cache usage, so it's also hard to make sense of.

In the interest of fixing things quickly with something that should be
*good enough*, this PR switches to instead periodically fetch memory
statistics from the cgroup's memory.stat and use that data to determine
if and when we should upscale.

This PR fixes #5444, which has a lot more detail on the difficulties
we've hit with memory.high. This PR also supersedes #5488.
2023-10-19 21:56:36 -07:00
Vadim Kharitonov
49377abd98 Merge pull request #5577 from neondatabase/releases/2023-10-17
Release 2023-10-17
2023-10-17 12:21:20 +02:00
Christian Schwarz
a6b2f4e54e limit imitate accesses concurrency, using same semaphore as compactions (#5578)
Before this PR, when we restarted pageserver, we'd see a rush of
`$number_of_tenants` concurrent eviction tasks starting to do imitate
accesses building up in the period of `[init_order allows activations,
$random_access_delay + EvictionPolicyLayerAccessThreshold::period]`.

We simply cannot handle that degree of concurrent IO.

We already solved the problem for compactions by adding a semaphore.
So, this PR shares that semaphore for use by evictions.

Part of https://github.com/neondatabase/neon/issues/5479

Which is again part of https://github.com/neondatabase/neon/issues/4743

Risks / Changes In System Behavior
==================================

* we don't do evictions as timely as we currently do
* we log a bunch of warnings about eviction taking too long
* imitate accesses and compactions compete for the same concurrency
limit, so they'll slow each other down through this shared semaphore

Changes
=======

- Move the `CONCURRENT_COMPACTIONS` semaphore into `tasks.rs`
- Rename it to `CONCURRENT_BACKGROUND_TASKS`
- Use it also for the eviction imitate accesses:
    - Imitate accesses are both per-TIMELINE and per-TENANT
    - The per-TENANT is done through coalescing all the per-TIMELINE
      tasks via a tokio mutex `eviction_task_tenant_state`.
    - We acquire the CONCURRENT_BACKGROUND_TASKS permit early, at the
      beginning of the eviction iteration, much before the imitate
      accesses start (and they may not even start at all in the given
      iteration, as they happen only every $threshold).
    - Acquiring early is **sub-optimal** because when the per-timeline
      tasks coalesce on the `eviction_task_tenant_state` mutex,
      they are already holding a CONCURRENT_BACKGROUND_TASKS permit.
    - It's also unfair because tenants with many timelines win
      the CONCURRENT_BACKGROUND_TASKS more often.
    - I don't think there's another way though, without refactoring
      more of the imitate accesses logic, e.g, making it all per-tenant.
- Add metrics for queue depth behind the semaphore.
I found these very useful to understand what work is queued in the
system.

    - The metrics are tagged by the new `BackgroundLoopKind`.
    - On a green slate, I would have used `TaskKind`, but we already had
      pre-existing labels whose names didn't map exactly to task kind.
      Also the task kind is kind of a lower-level detail, so, I think
it's fine to have a separate enum to identify background work kinds.

Future Work
===========

I guess I could move the eviction tasks from a ticker to "sleep for
$period".
The benefit would be that the semaphore automatically "smears" the
eviction task scheduling over time, so, we only have the rush on restart
but a smeared-out rush afterward.

The downside is that this perverts the meaning of "$period", as we'd
actually not run the eviction at a fixed period. It also means that the
"took too long" warning & metric become meaningless.

Then again, that is already the case for the compaction and gc tasks,
which do sleep for `$period` instead of using a ticker.

(cherry picked from commit 9256788273)
2023-10-17 12:16:26 +02:00
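The core mechanism reduced to a sketch; the limit value and task body are illustrative.

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // One limit shared by compaction and eviction imitate-access work, so a
    // restart can't fan out $number_of_tenants concurrent IO storms at once.
    let concurrent_background_tasks = Arc::new(Semaphore::new(4));
    let mut handles = Vec::new();

    for tenant in 0..16u32 {
        let sem = Arc::clone(&concurrent_background_tasks);
        handles.push(tokio::spawn(async move {
            // Acquired at the start of the iteration and held to the end;
            // queue depth behind this acquire is what the new metrics expose.
            let _permit = sem.acquire_owned().await.expect("semaphore closed");
            println!("tenant {tenant}: running background iteration");
        }));
    }
    for h in handles {
        h.await.unwrap();
    }
}
```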
Shany Pozin
face60d50b Merge pull request #5526 from neondatabase/releases/2023-10-11
Release 2023-10-11
2023-10-11 11:16:39 +03:00
Shany Pozin
9768aa27f2 Merge pull request #5516 from neondatabase/releases/2023-10-10
Release 2023-10-10
2023-10-10 14:16:47 +03:00
Shany Pozin
96b2e575e1 Merge pull request #5445 from neondatabase/releases/2023-10-03
Release 2023-10-03
2023-10-04 13:53:37 +03:00
Alexander Bayandin
7222777784 Update checksums for pg_jsonschema & pg_graphql (#5455)
## Problem

Folks have re-tagged releases for `pg_jsonschema` and `pg_graphql` (to
increase timeouts on their CI). For us, these are no-op changes,
but unfortunately they will cause our builds to fail due to a checksum
mismatch (this might not strike right away because of the build cache).
- 8ba7c7be9d
- aa7509370a

## Summary of changes
- `pg_jsonschema` update checksum
- `pg_graphql` update checksum
2023-10-03 18:44:30 +01:00
Em Sharnoff
5469fdede0 Merge pull request #5422 from neondatabase/sharnoff/rc-2023-09-28-fix-restart-on-postmaster-SIGKILL
Release 2023-09-28: Fix (lack of) restart on neonvm postmaster SIGKILL
2023-09-28 10:48:51 -07:00
MMeent
72aa6b9fdd Fix neon_zeroextend's WAL logging (#5387)
When you log more than a few blocks, you need to reserve the space in
advance. We didn't do that, so we got errors. Now we do that, and
shouldn't get errors.
2023-09-28 09:37:28 -07:00
Em Sharnoff
ae0634b7be Bump vm-builder v0.17.11 -> v0.17.12 (#5407)
Only relevant change is neondatabase/autoscaling#534 - refer there for
more details.
2023-09-28 09:28:04 -07:00
Shany Pozin
70711f32fa Merge pull request #5375 from neondatabase/releases/2023-09-26
Release 2023-09-26
2023-09-26 15:19:45 +03:00
Vadim Kharitonov
52a88af0aa Merge pull request #5336 from neondatabase/releases/2023-09-19
Release 2023-09-19
2023-09-19 11:16:43 +02:00
Alexander Bayandin
b7a43bf817 Merge branch 'release' into releases/2023-09-19 2023-09-19 09:07:20 +01:00
Alexander Bayandin
dce91b33a4 Merge pull request #5318 from neondatabase/releases/2023-09-15-1
Postgres 14/15: Use previous extensions versions
2023-09-15 16:30:44 +01:00
Alexander Bayandin
23ee4f3050 Revert plv8 only 2023-09-15 15:45:23 +01:00
Alexander Bayandin
46857e8282 Postgres 14/15: Use previous extensions versions 2023-09-15 15:27:00 +01:00
Alexander Bayandin
368ab0ce54 Merge pull request #5313 from neondatabase/releases/2023-09-15
Release 2023-09-15
2023-09-15 10:39:56 +01:00
Konstantin Knizhnik
a5987eebfd References to old and new blocks were mixed in xlog_heap_update handler (#5312)
## Problem

See https://neondb.slack.com/archives/C05L7D1JAUS/p1694614585955029

https://www.notion.so/neondatabase/Duplicate-key-issue-651627ce843c45188fbdcb2d30fd2178

## Summary of changes

Swap old/new block references

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-09-15 10:11:41 +01:00
Alexander Bayandin
6686ede30f Update checksum for pg_hint_plan (#5309)
## Problem

The checksum for `pg_hint_plan` doesn't match:
```
sha256sum: WARNING: 1 computed checksum did NOT match
```

Ref
https://github.com/neondatabase/neon/actions/runs/6185715461/job/16793609251?pr=5307

It seems that the release was retagged yesterday:
https://github.com/ossc-db/pg_hint_plan/releases/tag/REL16_1_6_0

I don't see any malicious changes from 15_1.5.1:
https://github.com/ossc-db/pg_hint_plan/compare/REL15_1_5_1...REL16_1_6_0,
so it should be ok to update.

## Summary of changes
- Update checksum for `pg_hint_plan` 16_1.6.0
2023-09-15 09:54:42 +01:00
Em Sharnoff
373c7057cc vm-monitor: Fix cgroup throttling (#5303)
I believe this (not actual IO problems) is the cause of the "disk speed
issue" that we've had for VMs recently. See e.g.:

1. https://neondb.slack.com/archives/C03H1K0PGKH/p1694287808046179?thread_ts=1694271790.580099&cid=C03H1K0PGKH
2. https://neondb.slack.com/archives/C03H1K0PGKH/p1694511932560659

The vm-informant (and now, the vm-monitor, its replacement) is supposed
to gradually increase the `neon-postgres` cgroup's memory.high value,
because otherwise the kernel will throttle all the processes in the
cgroup.

This PR fixes a bug with the vm-monitor's implementation of this
behavior.

---

Other references, for the vm-informant's implementation:

- Original issue: neondatabase/autoscaling#44
- Original PR: neondatabase/autoscaling#223
2023-09-15 09:54:42 +01:00
Shany Pozin
7d6ec16166 Merge pull request #5296 from neondatabase/releases/2023-09-13
Release 2023-09-13
2023-09-13 13:49:14 +03:00
Shany Pozin
0e6fdc8a58 Merge pull request #5283 from neondatabase/releases/2023-09-12
Release 2023-09-12
2023-09-12 14:56:47 +03:00
Christian Schwarz
521438a5c6 fix deadlock around TENANTS (#5285)
The sequence that can lead to a deadlock:

1. DELETE request gets all the way to `tenant.shutdown(progress,
false).await.is_err()`, while holding TENANTS.read()
2. POST request for tenant creation comes in, calls `tenant_map_insert`,
it does `let mut guard = TENANTS.write().await;`
3. Something that `tenant.shutdown()` needs to wait for needs a
`TENANTS.read().await`.
The only case identified in exhaustive manual scanning of the code base
is this one:
Imitate size access does `get_tenant().await`, which does
`TENANTS.read().await` under the hood.

In the above case (1) waits for (3), (3)'s read-lock request is queued
behind (2)'s write-lock, and (2) waits for (1).
Deadlock.

I made a reproducer/proof-that-above-hypothesis-holds in
https://github.com/neondatabase/neon/pull/5281 , but, it's not ready for
merge yet and we want the fix _now_.

fixes https://github.com/neondatabase/neon/issues/5284
2023-09-12 14:13:13 +03:00
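The cycle can be reproduced in miniature with tokio's fair `RwLock` (a toy model, not the reproducer from #5281):

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::RwLock;
use tokio::time::{sleep, timeout};

#[tokio::main]
async fn main() {
    // tokio::sync::RwLock is fair: once a writer is queued, later read()
    // calls wait behind it. That is what closes the cycle described above.
    let tenants = Arc::new(RwLock::new(()));

    let t1 = tenants.clone();
    let deleter = tokio::spawn(async move {
        let _read = t1.read().await; // (1) DELETE holds TENANTS.read()
        sleep(Duration::from_millis(50)).await;
        let _read2 = t1.read().await; // (3) shutdown's read queues behind (2)
        println!("deleter finished");
    });

    let t2 = tenants.clone();
    let creator = tokio::spawn(async move {
        sleep(Duration::from_millis(10)).await;
        let _write = t2.write().await; // (2) CREATE queues a write behind (1)
        println!("creator finished");
    });

    let done = timeout(Duration::from_secs(1), async {
        let _ = tokio::join!(deleter, creator);
    })
    .await;
    println!("deadlocked: {}", done.is_err()); // prints: deadlocked: true
}
```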
Vadim Kharitonov
07d7874bc8 Merge pull request #5202 from neondatabase/releases/2023-09-05
Release 2023-09-05
2023-09-05 12:16:06 +02:00
Anastasia Lubennikova
1804111a02 Merge pull request #5161 from neondatabase/rc-2023-08-31
Release 2023-08-31
2023-08-31 16:53:17 +03:00
Arthur Petukhovsky
cd0178efed Merge pull request #5150 from neondatabase/release-sk-fix-active-timeline
Release 2023-08-30
2023-08-30 11:43:39 +02:00
Shany Pozin
333574be57 Merge pull request #5133 from neondatabase/releases/2023-08-29
Release 2023-08-29
2023-08-29 14:02:58 +03:00
Alexander Bayandin
79a799a143 Merge branch 'release' into releases/2023-08-29 2023-08-29 11:17:57 +01:00
Conrad Ludgate
9da06af6c9 Merge pull request #5113 from neondatabase/release-http-connection-fix
Release 2023-08-25
2023-08-25 17:21:35 +01:00
Conrad Ludgate
ce1753d036 proxy: dont return connection pending (#5107)
## Problem

We were returning Pending when a connection had a notice/notification
(introduced recently in #5020). When returning pending, the runtime
assumes you will call `cx.waker().wake()` in order to continue
processing.

We weren't doing that, so the connection task would get stuck

## Summary of changes

Don't return pending. Loop instead
2023-08-25 16:42:30 +01:00
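The bug in miniature: a future that returns `Poll::Pending` without arranging a wake-up is never polled again. A minimal demonstration:

```rust
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

// Returning Pending without registering cx.waker() anywhere means the
// runtime never polls this future again: it hangs forever.
struct Stuck;

impl Future for Stuck {
    type Output = ();
    fn poll(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<()> {
        Poll::Pending
    }
}

#[tokio::main]
async fn main() {
    // The commit's fix is the moral equivalent of "loop instead": keep
    // processing ready work rather than returning Pending prematurely.
    let never = tokio::time::timeout(std::time::Duration::from_millis(100), Stuck).await;
    assert!(never.is_err(), "Stuck never completes");
}
```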
Alek Westover
67db8432b4 Fix cargo deny errors (#5068)
## Problem
cargo deny lint broken

Links to the CVEs:

[rustsec.org/advisories/RUSTSEC-2023-0052](https://rustsec.org/advisories/RUSTSEC-2023-0052)

[rustsec.org/advisories/RUSTSEC-2023-0053](https://rustsec.org/advisories/RUSTSEC-2023-0053)
One is fixed; the other one isn't, so we allow it (for now) to unbreak
CI. Later we'll try to get rid of webpki in favour of the rustls
fork.

## Summary of changes
```
+ignore = ["RUSTSEC-2023-0052"]
```
2023-08-25 16:42:30 +01:00
Vadim Kharitonov
4e2e44e524 Enable neon-pool-opt-in (#5062) 2023-08-22 09:06:14 +01:00
Vadim Kharitonov
ed786104f3 Merge pull request #5060 from neondatabase/releases/2023-08-22
Release 2023-08-22
2023-08-22 09:41:02 +02:00
Stas Kelvich
84b74f2bd1 Merge pull request #4997 from neondatabase/sk/proxy-release-23-07-15
Fix lint
2023-08-15 18:54:20 +03:00
Arthur Petukhovsky
fec2ad6283 Fix lint 2023-08-15 18:49:02 +03:00
Stas Kelvich
98eebd4682 Merge pull request #4996 from neondatabase/sk/proxy_release
Disable neon-pool-opt-in
2023-08-15 18:37:50 +03:00
Arthur Petukhovsky
2f74287c9b Disable neon-pool-opt-in 2023-08-15 18:34:17 +03:00
Shany Pozin
aee1bf95e3 Merge pull request #4990 from neondatabase/releases/2023-08-15
Release 2023-08-15
2023-08-15 15:34:38 +03:00
Shany Pozin
b9de9d75ff Merge branch 'release' into releases/2023-08-15 2023-08-15 14:35:00 +03:00
Stas Kelvich
7943b709e6 Merge pull request #4940 from neondatabase/sk/release-23-05-25-proxy-fixup
Release: proxy retry fixup
2023-08-09 13:53:19 +03:00
Conrad Ludgate
d7d066d493 proxy: delay auth on retry (#4929)
## Problem

When an endpoint is shutting down, it can take a few seconds. Currently
when starting a new compute, this causes an "endpoint is in transition"
error. We need to add delays before retrying to ensure that we allow
time for the endpoint to shut down properly.

## Summary of changes

Adds a delay before retrying in auth. connect_to_compute already has
this delay
2023-08-09 12:54:24 +03:00
Felix Prasanna
e78ac22107 release fix: revert vm builder bump from 0.13.1 -> 0.15.0-alpha1 (#4932)
This reverts commit 682dfb3a31.

hotfix for a CLI arg issue in the monitor
2023-08-08 21:08:46 +03:00
Vadim Kharitonov
76a8f2bb44 Merge pull request #4923 from neondatabase/releases/2023-08-08
Release 2023-08-08
2023-08-08 11:44:38 +02:00
Vadim Kharitonov
8d59a8581f Merge branch 'release' into releases/2023-08-08 2023-08-08 10:54:34 +02:00
Vadim Kharitonov
b1ddd01289 Define NEON_SMGR to make it possible for extensions to use Neon SMG API (#4889)
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-08-03 16:28:31 +03:00
Alexander Bayandin
6eae4fc9aa Release 2023-08-02: update pg_embedding (#4877)
Cherry-picking ca4d71a954 from `main` into
the `release`

Co-authored-by: Vadim Kharitonov <vadim2404@users.noreply.github.com>
2023-08-03 08:48:09 +02:00
Christian Schwarz
765455bca2 Merge pull request #4861 from neondatabase/releases/2023-08-01--2-fix-pipeline
ci: fix upload-postgres-extensions-to-s3 job
2023-08-01 13:22:07 +02:00
Christian Schwarz
4204960942 ci: fix upload-postgres-extensions-to-s3 job
commit

	commit 5f8fd640bf
	Author: Alek Westover <alek.westover@gmail.com>
	Date:   Wed Jul 26 08:24:03 2023 -0400

	    Upload Test Remote Extensions (#4792)

switched to using the release tag instead of `latest`, but,
the `promote-images` job only uploads `latest` to the prod ECR.

The switch to using release tag was good in principle, but,
reverting that part to make the release pipeine work.

Note that a proper fix should abandon use of the `:latest` tag
altogether: currently, if a `main` pipeline runs concurrently
with a `release` pipeline, the `release` pipeline may end
up using the `main` pipeline's images.
2023-08-01 12:01:45 +02:00
Christian Schwarz
67345d66ea Merge pull request #4858 from neondatabase/releases/2023-08-01
Release 2023-08-01
2023-08-01 10:44:01 +02:00
Shany Pozin
2266ee5971 Merge pull request #4803 from neondatabase/releases/2023-07-25
Release 2023-07-25
2023-07-25 14:21:07 +03:00
Shany Pozin
b58445d855 Merge pull request #4746 from neondatabase/releases/2023-07-18
Release 2023-07-18
2023-07-18 14:45:39 +03:00
Conrad Ludgate
36050e7f3d Merge branch 'release' into releases/2023-07-18 2023-07-18 12:00:09 +01:00
Alexander Bayandin
33360ed96d Merge pull request #4705 from neondatabase/release-2023-07-12
Release 2023-07-12 (only proxy)
2023-07-12 19:44:36 +01:00
Conrad Ludgate
39a28d1108 proxy wake_compute loop (#4675)
## Problem

If we fail to wake up the compute node, a subsequent connect attempt
will definitely fail. However, kubernetes won't fail the connection
immediately, instead it hangs until we timeout (10s).

## Summary of changes

Refactor the loop to allow fast retries of compute_wake and to skip a
connect attempt.
2023-07-12 18:40:11 +01:00
Conrad Ludgate
efa6aa134f allow repeated IO errors from compute node (#4624)
## Problem

#4598: compute nodes are not accessible for some time after wake-up due to
kubernetes DNS not being fully propagated.

## Summary of changes

Update connect retry mechanism to support handling IO errors and
sleeping for 100ms

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
2023-07-12 18:40:06 +01:00
Alexander Bayandin
2c724e56e2 Merge pull request #4646 from neondatabase/releases/2023-07-06-hotfix
Release 2023-07-06 (add pg_embedding extension only)
2023-07-06 12:19:52 +01:00
Alexander Bayandin
feff887c6f Compile pg_embedding extension (#4634)
```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 
(5 rows)
```
2023-07-06 09:39:41 +01:00
Vadim Kharitonov
353d915fcf Merge pull request #4633 from neondatabase/releases/2023-07-05
Release 2023-07-05
2023-07-05 15:10:47 +02:00
Vadim Kharitonov
2e38098cbc Merge branch 'release' into releases/2023-07-05 2023-07-05 12:41:48 +02:00
Vadim Kharitonov
a6fe5ea1ac Merge pull request #4571 from neondatabase/releases/2023-06-27
Release 2023-06-27
2023-06-27 12:55:33 +02:00
Vadim Kharitonov
05b0aed0c1 Merge branch 'release' into releases/2023-06-27 2023-06-27 12:22:12 +02:00
Alex Chi Z
cd1705357d Merge pull request #4561 from neondatabase/releases/2023-06-23-hotfix
Release 2023-06-23 (pageserver-only)
2023-06-23 15:38:50 -04:00
Christian Schwarz
6bc7561290 don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker
The consumption metrics synthetic size worker does logical size calculation.
Logical size calculation currently does synchronous disk IO.
This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures.

While there's work on the way to move the synchronous disk IO into spawn_blocking,
the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME.

Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME
executor threads on CPU or sync disk IO.
That work isn't done yet, as many of the mgmt tasks still _do_ disk IO.
But it's not as intensive as the logical size calculations that we're fixing here.

While we're at it, fix disk-usage-based eviction in a similar way.
It wasn't the culprit here, according to prod logs, but it can theoretically be
a little CPU-intensive.

More context, including graphs from Prod:
https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949

(cherry picked from commit d6e35222ea)
2023-06-23 20:54:07 +02:00
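A sketch of the runtime separation, assuming `once_cell` for the statics; the runtime names mirror the commit message, and the thread counts are illustrative.

```rust
use once_cell::sync::Lazy;
use tokio::runtime::{Builder, Runtime};

static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread()
        .worker_threads(2)
        .enable_all()
        .build()
        .unwrap()
});

static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread().enable_all().build().unwrap()
});

fn main() {
    // CPU- and sync-disk-IO-heavy work goes on the background runtime so it
    // can't starve the executor threads serving management API requests.
    let handle = BACKGROUND_RUNTIME.spawn(async {
        // ... consumption metrics / logical size calculation ...
    });
    BACKGROUND_RUNTIME.block_on(handle).unwrap();
    let _ = &*MGMT_REQUEST_RUNTIME; // reserved for mgmt API handlers
}
```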
Christian Schwarz
fbd3ac14b5 Merge pull request #4544 from neondatabase/releases/2023-06-21-hotfix
Release 2023-06-21 (fixup for post-merge failed 2023-06-20)
2023-06-21 16:54:34 +03:00
Christian Schwarz
e437787c8f cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:52:56 +03:00
Christian Schwarz
3460dbf90b Merge pull request #4536 from neondatabase/releases/2023-06-20
Release 2023-06-20 (actually 2023-06-21)
2023-06-21 14:19:14 +03:00
Vadim Kharitonov
6b89d99677 Merge pull request #4521 from neondatabase/release_2023-06-15
Release 2023 06 15
2023-06-15 17:40:01 +02:00
Vadim Kharitonov
6cc8ea86e4 Merge branch 'main' into release_2023-06-15 2023-06-15 16:50:44 +02:00
Shany Pozin
e62a492d6f Merge pull request #4486 from neondatabase/releases/2023-06-13
Release 2023-06-13
2023-06-13 15:21:35 +03:00
Alexey Kondratov
a475cdf642 [compute_ctl] Fix logging if catalog updates are skipped (#4480)
Otherwise, it wasn't clear from the log when Postgres started up
completely if catalog updates were skipped.

Follow-up for 4936ab6
2023-06-13 13:37:24 +02:00
Stas Kelvich
7002c79a47 Merge pull request #4447 from neondatabase/release_proxy_08-06-2023
Release proxy 08 06 2023
2023-06-08 21:02:54 +03:00
Vadim Kharitonov
ee6cf357b4 Merge pull request #4427 from neondatabase/releases/2023-06-06
Release 2023-06-06
2023-06-06 14:42:21 +02:00
Vadim Kharitonov
e5c2086b5f Merge branch 'release' into releases/2023-06-06 2023-06-06 12:33:56 +02:00
Shany Pozin
5f1208296a Merge pull request #4395 from neondatabase/releases/2023-06-01
Release 2023-06-01
2023-06-01 10:58:00 +03:00
Stas Kelvich
88e8e473cd Merge pull request #4345 from neondatabase/release-23-05-25-proxy
Release 23-05-25, take 3
2023-05-25 19:40:43 +03:00
Stas Kelvich
b0a77844f6 Add SQL-over-HTTP endpoint to Proxy
This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON
response structure resembling that of the node-postgres driver. This method,
using HTTP POST, achieves smaller amortized latencies in edge setups due to
fewer round trips and enhanced open-connection reuse by the v8 engine.

This update involves several intricacies:
1. SQL injection protection: We employed the extended query protocol, modifying
   the rust-postgres driver to send queries in one roundtrip using a text
   protocol rather than binary, bypassing potential issues like those identified
   in https://github.com/sfackler/rust-postgres/issues/1030.

2. Postgres type compatibility: As not all postgres types have binary
   representations (e.g., acl's in pg_class), we adjusted rust-postgres to
   respond with text protocol, simplifying serialization and fixing queries with
   text-only types in response.

3. Data type conversion: Considering JSON supports fewer data types than
   Postgres, we perform conversions where possible, passing all other types as
   strings. Key conversions include:
   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
     text)
   - postgres bool, null, text -> json bool, null, string
   - postgres array -> json array
   - postgres json and jsonb -> json object

4. Alignment with node-postgres: To facilitate integration with js libraries,
   we've matched the response structure of node-postgres, returning command tags
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
2023-05-25 17:59:17 +03:00
Vadim Kharitonov
1baf464307 Merge pull request #4309 from neondatabase/releases/2023-05-23
Release 2023-05-23
2023-05-24 11:56:54 +02:00
Alexander Bayandin
e9b8e81cea Merge branch 'release' into releases/2023-05-23 2023-05-23 12:54:08 +01:00
Alexander Bayandin
85d6194aa4 Fix regress-tests job for Postgres 15 on release branch (#4254)
## Problem

Compatibility tests don't support Postgres 15 yet, but we're still
trying to upload the compatibility snapshot (which we do not collect).

Ref
https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129

## Summary of changes

Add `pg_version` parameter to `run-python-test-set` actions and do not
upload compatibility snapshot for Postgres 15
2023-05-16 17:19:12 +01:00
Vadim Kharitonov
333a7a68ef Merge pull request #4245 from neondatabase/releases/2023-05-16
Release 2023-05-16
2023-05-16 13:38:40 +02:00
Vadim Kharitonov
6aa4e41bee Merge branch 'release' into releases/2023-05-16 2023-05-16 12:48:23 +02:00
Joonas Koivunen
840183e51f try: higher page_service timeouts to isolate an issue 2023-05-11 16:24:53 +03:00
Shany Pozin
cbccc94b03 Merge pull request #4184 from neondatabase/releases/2023-05-09
Release 2023-05-09
2023-05-09 15:30:36 +03:00
Stas Kelvich
fce227df22 Merge pull request #4163 from neondatabase/main
Release 23-05-05
2023-05-05 15:56:23 +03:00
Stas Kelvich
bd787e800f Merge pull request #4133 from neondatabase/main
Release 23-04-01
2023-05-01 18:52:46 +03:00
Shany Pozin
4a7704b4a3 Merge pull request #4131 from neondatabase/sp/hotfix_adding_sks_us_west
Hotfix: Adding 4 new pageservers and two sets of safekeepers to us west 2
2023-05-01 15:17:38 +03:00
Shany Pozin
ff1119da66 Add 2 new sets of safekeepers to us-west2 2023-05-01 14:35:31 +03:00
Shany Pozin
4c3ba1627b Add 4 new Pageservers for retool launch 2023-05-01 14:34:38 +03:00
Vadim Kharitonov
1407174fb2 Merge pull request #4110 from neondatabase/vk/release_2023-04-28
Release 2023 04 28
2023-04-28 17:43:16 +02:00
Vadim Kharitonov
ec9dcb1889 Merge branch 'release' into vk/release_2023-04-28 2023-04-28 16:32:26 +02:00
Joonas Koivunen
d11d781afc revert: "Add check for duplicates of generated image layers" (#4104)
This reverts commit 732acc5.

Reverted PR: #3869

As noted in PR #4094, we do in fact try to insert duplicates to the
layer map, if L0->L1 compaction is interrupted. We do not have a proper
fix for that right now, and we are in a hurry to make a release to
production, so revert the changes related to this to the state that we
have in production currently. We know that we have a bug here, but
better to live with the bug that we've had in production for a long
time, than rush a fix to production without testing it in staging first.

Cc: #4094, #4088
2023-04-28 16:31:35 +02:00
Anastasia Lubennikova
4e44565b71 Merge pull request #4000 from neondatabase/releases/2023-04-11
Release 2023-04-11
2023-04-11 17:47:41 +03:00
Stas Kelvich
4ed51ad33b Add more proxy cnames 2023-04-11 15:59:35 +03:00
Arseny Sher
1c1ebe5537 Merge pull request #3946 from neondatabase/releases/2023-04-04
Release 2023-04-04
2023-04-04 14:38:40 +04:00
Christian Schwarz
c19cb7f386 Merge pull request #3935 from neondatabase/releases/2023-04-03
Release 2023-04-03
2023-04-03 16:19:49 +02:00
Vadim Kharitonov
4b97d31b16 Merge pull request #3896 from neondatabase/releases/2023-03-28
Release 2023-03-28
2023-03-28 17:58:06 +04:00
Shany Pozin
923ade3dd7 Merge pull request #3855 from neondatabase/releases/2023-03-21
Release 2023-03-21
2023-03-21 13:12:32 +02:00
Arseny Sher
b04e711975 Merge pull request #3825 from neondatabase/release-2023-03-15
Release 2023.03.15
2023-03-15 15:38:00 +03:00
Arseny Sher
afd0a6b39a Forward framed read buf contents to compute before proxy pass.
Otherwise they get lost. Normally the buffer is empty before proxy pass, but this is
not the case with the pipeline mode of our npm driver; this fixes the connection hangup
introduced by b80fe41af3 for it.

fixes https://github.com/neondatabase/neon/issues/3822
2023-03-15 15:36:06 +04:00
Lassi Pölönen
99752286d8 Use RollingUpdate strategy also for legacy proxy (#3814)
## Describe your changes
We have previously changed the neon-proxy to use RollingUpdate. This
should be enabled in the legacy proxy too, to avoid breaking client
connections and to allow, for example, backups to run even during
deployment. (https://github.com/neondatabase/neon/pull/3683)

## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3333
2023-03-15 15:35:51 +04:00
Arseny Sher
15df93363c Merge pull request #3804 from neondatabase/release-2023-03-13
Release 2023.03.13
2023-03-13 20:25:40 +03:00
Vadim Kharitonov
bc0ab741af Merge pull request #3758 from neondatabase/releases/2023-03-07
Release 2023-03-07
2023-03-07 12:38:47 +01:00
Christian Schwarz
51d9dfeaa3 Merge pull request #3743 from neondatabase/releases/2023-03-03
Release 2023-03-03
2023-03-03 19:20:21 +01:00
Shany Pozin
f63cb18155 Merge pull request #3713 from neondatabase/releases/2023-02-28
Release 2023-02-28
2023-02-28 12:52:24 +02:00
Arseny Sher
0de603d88e Merge pull request #3707 from neondatabase/release-2023-02-24
Release 2023-02-24

Hotfix for UNLOGGED tables. Contains #3706
Also contains rebase on 14.7 and 15.2 #3581
2023-02-25 00:32:11 +04:00
Heikki Linnakangas
240913912a Fix UNLOGGED tables.
Instead of trying to create missing files on the way, send init fork contents as
main fork from pageserver during basebackup. Add test for that. Call
put_rel_drop for init forks; previously they weren't removed. Bump
vendor/postgres to revert previous approach on Postgres side.

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>

ref https://github.com/neondatabase/postgres/pull/264
ref https://github.com/neondatabase/postgres/pull/259
ref https://github.com/neondatabase/neon/issues/1222
2023-02-24 23:54:53 +04:00
MMeent
91a4ea0de2 Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581)
## Describe your changes
Rebase vendored PostgreSQL onto 14.7 and 15.2

## Issue ticket number and link

#3579

## Checklist before requesting a review
- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
    ```
The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL
14 and 15.2 for PostgreSQL 15.
    ```
2023-02-24 23:54:42 +04:00
Arseny Sher
8608704f49 Merge pull request #3691 from neondatabase/release-2023-02-23
Release 2023-02-23

Hotfix for the unlogged tables with indexes issue.

neondatabase/postgres#259
neondatabase/postgres#262
2023-02-23 13:39:33 +04:00
Arseny Sher
efef68ce99 Bump vendor/postgres to include hotfix for unlogged tables with indexes.
https://github.com/neondatabase/postgres/pull/259
https://github.com/neondatabase/postgres/pull/262
2023-02-23 08:49:43 +04:00
Joonas Koivunen
8daefd24da Merge pull request #3679 from neondatabase/releases/2023-02-22
Releases/2023-02-22
2023-02-22 15:56:55 +02:00
Arthur Petukhovsky
46cc8b7982 Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671)
We migrated all timelines to
`safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be
removed.
2023-02-22 15:07:57 +02:00
Sergey Melnikov
38cd90dd0c Add -v to ansible invocations (#3670)
To get more debug output on failures
2023-02-22 15:07:57 +02:00
Joonas Koivunen
a51b269f15 fix: hold permit until GetObject eof (#3663)
Previously we applied the rate limiting only up to receiving the headers
from S3, or somewhere near it. The commit adds an adapter which carries
the permit until the AsyncRead has been dropped.

fixes #3662.
2023-02-22 15:07:57 +02:00
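A sketch of such a permit-carrying adapter; the names are illustrative, and the real adapter wraps the S3 GetObject body stream.

```rust
use std::io;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

/// Holds the rate-limit permit until the reader itself is dropped, i.e.
/// until the caller has fully consumed (or abandoned) the stream, not
/// merely until the response headers arrived.
struct PermitCarrying<R> {
    inner: R,
    _permit: OwnedSemaphorePermit,
}

impl<R: AsyncRead + Unpin> AsyncRead for PermitCarrying<R> {
    fn poll_read(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<io::Result<()>> {
        Pin::new(&mut self.inner).poll_read(cx, buf)
    }
}

#[tokio::main]
async fn main() {
    let limiter = Arc::new(Semaphore::new(1));
    let permit = limiter.clone().acquire_owned().await.unwrap();
    let mut reader = PermitCarrying { inner: tokio::io::empty(), _permit: permit };
    let mut buf = Vec::new();
    reader.read_to_end(&mut buf).await.unwrap();
    // The permit is released here, when `reader` is dropped.
}
```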
Joonas Koivunen
43bf6d0a0f calculate_logical_size: no longer use spawn_blocking (#3664)
Calculation of logical size is now async because of layer downloads, so
we shouldn't use spawn_blocking for it. Use of `spawn_blocking`
exhausted resources which are needed by `tokio::io::copy` when copying
from a stream to a file, which led to a deadlock.

Fixes: #3657
2023-02-22 15:07:57 +02:00
Joonas Koivunen
15273a9b66 chore: ignore all compaction inactive tenant errors (#3665)
these are happening in tests because of #3655 but they sure took some
time to appear.

makes the `Compaction failed, retrying in 2s: Cannot run compaction
iteration on inactive tenant` into a globally allowed error, because it
has been seen failing on different test cases.
2023-02-22 15:07:57 +02:00
Joonas Koivunen
78aca668d0 fix: log download failed error (#3661)
Fixes #3659
2023-02-22 15:07:57 +02:00
Vadim Kharitonov
acbf4148ea Merge pull request #3656 from neondatabase/releases/2023-02-21
Release 2023-02-21
2023-02-21 16:03:48 +01:00
Vadim Kharitonov
6508540561 Merge branch 'release' into releases/2023-02-21 2023-02-21 15:31:16 +01:00
Arthur Petukhovsky
a41b5244a8 Add new safekeeper to ap-southeast-1 prod (#3645) (#3646)
To trigger deployment of #3645 to production.
2023-02-20 15:22:49 +00:00
Shany Pozin
2b3189be95 Merge pull request #3600 from neondatabase/releases/2023-02-14
Release 2023-02-14
2023-02-15 13:31:30 +02:00
Vadim Kharitonov
248563c595 Merge pull request #3553 from neondatabase/releases/2023-02-07
Release 2023-02-07
2023-02-07 14:07:44 +01:00
Vadim Kharitonov
14cd6ca933 Merge branch 'release' into releases/2023-02-07 2023-02-07 12:11:56 +01:00
Vadim Kharitonov
eb36403e71 Release 2023 01 31 (#3497)
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Sergey Melnikov <sergey@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
2023-01-31 15:06:35 +02:00
Anastasia Lubennikova
3c6f779698 Merge pull request #3411 from neondatabase/release_2023_01_23
Fix Release 2023 01 23
2023-01-23 20:10:03 +02:00
Joonas Koivunen
f67f0c1c11 More tenant size fixes (#3410)
Small changes, but hopefully this will help with the panic detected in
staging, for which we cannot get the debugging information right now
(end-of-branch before branch-point).
2023-01-23 17:46:13 +02:00
Shany Pozin
edb02d3299 Adding pageserver3 to staging (#3403) 2023-01-23 17:46:13 +02:00
Konstantin Knizhnik
664a69e65b Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) 2023-01-23 17:46:13 +02:00
Anastasia Lubennikova
478322ebf9 Fix tenant size orphans (#3377)
Before, only the timelines which had passed the `gc_horizon` were
processed, which failed with orphans at the tree_sort phase. Example
input is in the added `test_branched_empty_timeline_size` test case.

The PR changes iteration to happen through all timelines, and in
addition to that, any learned branch points will be calculated as they
would have been in the original implementation if the ancestor branch had
been over the `gc_horizon`.

This also changes how tenants where all timelines are below `gc_horizon`
are handled. Previously tenant_size 0 was returned, but now they will
have approximately `initdb_lsn` worth of tenant_size.

The PR also adds several new tenant size tests that describe various corner
cases of branching structure and `gc_horizon` setting.
They are currently disabled to not consume time during CI.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-01-23 17:46:13 +02:00
Joonas Koivunen
802f174072 fix: dont stop pageserver if we fail to calculate synthetic size 2023-01-23 17:46:13 +02:00
Alexey Kondratov
47f9890bae [compute_ctl] Make role deletion spec processing idempotent (#3380)
Previously, we were trying to re-assign owned objects of an already
deleted role. This was causing a crash loop when the compute was
restarted with a spec that includes a delta operation for role
deletion. To avoid such cases, check that the role is still present
before calling `reassign_owned_objects`.

Resolves neondatabase/cloud#3553
2023-01-23 17:46:13 +02:00
Christian Schwarz
262265daad Revert "Use actual temporary dir for pageserver unit tests"
This reverts commit 826e89b9ce.

The problem with that commit was that it deletes the TempDir while
there are still EphemeralFile instances open.

At first I thought this could be fixed by simply adding

  Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None))

to TenantHarness::drop, but it turned out to be insufficient.

So, reverting the commit until we find a proper solution.

refs https://github.com/neondatabase/neon/issues/3385
2023-01-23 17:46:13 +02:00
bojanserafimov
300da5b872 Improve layer map docstrings (#3382) 2023-01-23 17:46:13 +02:00
Heikki Linnakangas
7b22b5c433 Switch to 'tracing' for logging, restructure code to make use of spans.
Refactors Compute::prepare_and_run. It's split into subroutines
differently, to make it easier to attach tracing spans to the
different stages. The high-level logic for waiting for Postgres to
exit is moved to the caller.

Replace 'env_logger' with 'tracing', and add `#instrument` directives
to different stages of the startup process. This is a fairly
mechanical change, except for the changes in 'spec.rs'. 'spec.rs'
contained some complicated formatting, where parts of log messages
were printed directly to stdout with `print`s. That was a bit messed
up because the log normally goes to stderr, but those lines were
printed to stdout. In our docker images, stderr and stdout both go to
the same place so you wouldn't notice, but I don't think it was
intentional.

This changes the log format to the default
'tracing_subscriber::format' format. It's different from the Postgres
log format, however, and because both compute_tools and Postgres print
to the same log, it's now a mix of two different formats.  I'm not
sure how the Grafana log parsing pipeline can handle that. If it's a
problem, we can build custom formatter to change the compute_tools log
format to be the same as Postgres's, like it was before this commit,
or we can change the Postgres log format to match tracing_formatter's,
or we can start printing compute_tool's log output to a different
destination than Postgres
2023-01-23 17:46:12 +02:00
Kirill Bulatov
ffca97bc1e Enable logs in unit tests 2023-01-23 17:46:12 +02:00
Kirill Bulatov
cb356f3259 Use actual temporary dir for pageserver unit tests 2023-01-23 17:46:12 +02:00
Vadim Kharitonov
c85374295f Change SENTRY_ENVIRONMENT from "development" to "staging" 2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
4992160677 Fix metric_collection_endpoint for prod.
It was incorrectly set to staging url
2023-01-23 17:46:12 +02:00
Heikki Linnakangas
bd535b3371 If an error happens while checking for core dumps, don't panic.
If we panic, we skip the 30s wait in 'main', and don't give the
console a chance to observe the error. Which is not nice.

Spotted by @ololobus at
https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981
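
A rough sketch of the non-panicking approach, with hypothetical names and paths:

```rust
use std::path::Path;

fn check_core_dumps(data_dir: &Path) {
    match std::fs::read_dir(data_dir) {
        Ok(entries) => {
            for entry in entries.flatten() {
                if entry.file_name().to_string_lossy().starts_with("core") {
                    tracing::warn!("found core dump: {}", entry.path().display());
                }
            }
        }
        // Don't panic: log the error and return, so `main` still performs
        // its 30s wait and the console gets a chance to observe the error.
        Err(e) => tracing::error!("failed to scan {} for core dumps: {e}", data_dir.display()),
    }
}
```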
2023-01-23 17:46:12 +02:00
Kirill Bulatov
d90c5a03af Add more io::Error context when fail to operate on a path (#3254)
I have a test failure that shows 

```
Caused by:
    0: Failed to reconstruct a page image:
    1: Directory not empty (os error 39)
```

but does not really show where exactly that happens.

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d

The PR aims to add more context to help in debugging that issue.
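
A minimal sketch of the pattern, using `anyhow::Context` (the function name here is illustrative):

```rust
use anyhow::{Context, Result};
use std::path::Path;

// Attach the path to the io::Error, so a failure like
// "Directory not empty (os error 39)" also says which path it hit.
fn remove_timeline_dir(path: &Path) -> Result<()> {
    std::fs::remove_dir(path)
        .with_context(|| format!("Failed to remove directory {}", path.display()))
}
```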
2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
2d02cc9079 Merge pull request #3365 from neondatabase/main
Release 2023-01-17
2023-01-17 16:41:34 +02:00
Christian Schwarz
49ad94b99f Merge pull request #3301 from neondatabase/release-2023-01-10
Release 2023-01-10
2023-01-10 16:42:26 +01:00
Christian Schwarz
948a217398 Merge commit '95bf19b85a06b27a7fc3118dee03d48648efab15' into release-2023-01-10
Conflicts:
        .github/helm-values/neon-stress.proxy-scram.yaml
        .github/helm-values/neon-stress.proxy.yaml
        .github/helm-values/staging.proxy-scram.yaml
        .github/helm-values/staging.proxy.yaml
        All of the above were deleted in `main` after we hotfixed them
        in `release`. Deleting them here.
        storage_broker/src/bin/storage_broker.rs
        Hotfix toned down logging, but `main` has since implemented
        a proper fix. Taken `main`'s side, see
        https://neondb.slack.com/archives/C033RQ5SPDH/p1673354385387479?thread_ts=1673354306.474729&cid=C033RQ5SPDH

closes https://github.com/neondatabase/neon/issues/3287
2023-01-10 15:40:14 +01:00
Dmitry Rodionov
125381eae7 Merge pull request #3236 from neondatabase/dkr/retrofit-sk4-sk4-change
Move zenith-1-sk-3 to zenith-1-sk-4 (#3164)
2022-12-30 14:13:50 +03:00
Arthur Petukhovsky
cd01bbc715 Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) 2022-12-30 12:32:52 +02:00
Dmitry Rodionov
d8b5e3b88d Merge pull request #3229 from neondatabase/dkr/add-pageserver-for-release
add pageserver to a new region, see https://github.com/neondatabase/aws/pull/116

decrease log volume for pageserver
2022-12-30 12:34:04 +03:00
Dmitry Rodionov
06d25f2186 switch to debug from info to produce less noise 2022-12-29 17:48:47 +02:00
Dmitry Rodionov
f759b561f3 add pageserver to a new region, see https://github.com/neondatabase/aws/pull/116 2022-12-29 17:17:35 +02:00
Sergey Melnikov
ece0555600 Push proxy metrics to Victoria Metrics (#3106) 2022-12-16 14:44:49 +02:00
Joonas Koivunen
73ea0a0b01 fix(remote_storage): use cached credentials (#3128)
IMDSv2 has limits, and if we query it on every s3 interaction we are
going to go over those limits. Changes the s3_bucket client
configuration to use:
- ChainCredentialsProvider to handle env variables or imds usage
- LazyCachingCredentialsProvider to actually cache any credentials

Related: https://github.com/awslabs/aws-sdk-rust/issues/629
Possibly related: https://github.com/neondatabase/neon/issues/3118
2022-12-16 14:44:49 +02:00
Arseny Sher
d8f6d6fd6f Merge pull request #3126 from neondatabase/broker-lb-release
Deploy broker with L4 LB in new env.
2022-12-16 01:25:28 +03:00
Arseny Sher
d24de169a7 Deploy broker with L4 LB in new env.
Seems to fix an issue with missing keepalives.
2022-12-16 01:45:32 +04:00
Arseny Sher
0816168296 Hotfix: terminate subscription if channel is full.
Might help as a hotfix, but we need to understand the root cause better.
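
A sketch of the idea, assuming a plain tokio mpsc channel (the broker's real types differ):

```rust
use tokio::sync::mpsc;

#[derive(Debug)]
enum SubError {
    Lagging,
}

fn publish(tx: &mpsc::Sender<Vec<u8>>, msg: Vec<u8>) -> Result<(), SubError> {
    match tx.try_send(msg) {
        Ok(()) => Ok(()),
        // Subscriber can't keep up: terminate its subscription instead of
        // blocking the broker's fanout loop.
        Err(mpsc::error::TrySendError::Full(_)) => Err(SubError::Lagging),
        // Subscriber already went away; nothing to do.
        Err(mpsc::error::TrySendError::Closed(_)) => Ok(()),
    }
}
```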
2022-12-15 12:23:56 +03:00
Dmitry Rodionov
277b44d57a Merge pull request #3102 from neondatabase/main
Hotfix. See commits for details
2022-12-14 19:38:43 +03:00
MMeent
68c2c3880e Merge pull request #3038 from neondatabase/main
Release 22-12-14
2022-12-14 14:35:47 +01:00
Arthur Petukhovsky
49da498f65 Merge pull request #2833 from neondatabase/main
Release 2022-11-16
2022-11-17 08:44:10 +01:00
Stas Kelvich
2c76ba3dd7 Merge pull request #2718 from neondatabase/main-rc-22-10-28
Release 22-10-28
2022-10-28 20:33:56 +03:00
Arseny Sher
dbe3dc69ad Merge branch 'main' into main-rc-22-10-28
Release 22-10-28.
2022-10-28 19:10:11 +04:00
Arseny Sher
8e5bb3ed49 Enable etcd compaction in neon_local. 2022-10-27 12:53:20 +03:00
Stas Kelvich
ab0be7b8da Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new gold linker version. We used to install
it via binutils packages from testing, but that also updates libc, which causes
trouble in the resulting image, as different extensions end up built against
different libc versions. We could either use libc from debian-testing everywhere
or refrain from using testing packages and install the necessary programs manually.
This patch uses the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In passing, declare h3_postgis as a safe extension (a previous omission).
2022-10-27 12:53:20 +03:00
bojanserafimov
b4c55f5d24 Move pagestream api to libs/pageserver_api (#2698) 2022-10-27 12:53:20 +03:00
mikecaat
ede70d833c Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-27 12:53:20 +03:00
Sergey Melnikov
70c3d18bb0 Do not release to new staging proxies on release (#2685) 2022-10-27 12:53:20 +03:00
bojanserafimov
7a491f52c4 Add draw_timeline binary (#2688) 2022-10-27 12:53:20 +03:00
Alexander Bayandin
323c4ecb4f Add data format backward compatibility tests (#2626) 2022-10-27 12:53:20 +03:00
Anastasia Lubennikova
3d2466607e Merge pull request #2692 from neondatabase/main-rc
Release 2022-10-25
2022-10-25 18:18:58 +03:00
Anastasia Lubennikova
ed478b39f4 Merge branch 'release' into main-rc 2022-10-25 17:06:33 +03:00
Stas Kelvich
91585a558d Merge pull request #2678 from neondatabase/stas/hotfix_schema
Hotfix to disable grant create on public schema
2022-10-22 02:54:31 +03:00
Stas Kelvich
93467eae1f Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
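
A hedged sketch of a guarded variant; the grantee role name is hypothetical:

```rust
use anyhow::Result;
use postgres::Client;

// Only issue the GRANT when the `public` schema actually exists.
fn grant_create_on_public(client: &mut Client) -> Result<()> {
    let has_public = client
        .query_opt("SELECT 1 FROM pg_catalog.pg_namespace WHERE nspname = 'public'", &[])?
        .is_some();
    if has_public {
        client.simple_query("GRANT CREATE ON SCHEMA public TO neon_superuser")?;
    }
    Ok(())
}
```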
2022-10-22 02:26:28 +03:00
Stas Kelvich
f3aac81d19 Merge pull request #2668 from neondatabase/main
Release 2022-10-21
2022-10-21 15:21:42 +03:00
Stas Kelvich
979ad60c19 Merge pull request #2581 from neondatabase/main
Release 2022-10-07
2022-10-07 16:50:55 +03:00
Stas Kelvich
9316cb1b1f Merge pull request #2573 from neondatabase/main
Release 2022-10-06
2022-10-07 11:07:06 +03:00
Anastasia Lubennikova
e7939a527a Merge pull request #2377 from neondatabase/main
Release 2022-09-01
2022-09-01 20:20:44 +03:00
Arthur Petukhovsky
36d26665e1 Merge pull request #2299 from neondatabase/main
* Check for entire range during sasl validation (#2281)

* Gen2 GH runner (#2128)

* Re-add rustup override

* Try s3 bucket

* Set git version

* Use v4 cache key to prevent problems

* Switch to v5 for key

* Add second rustup fix

* Rebase

* Add kaniko steps

* Fix typo and set compress level

* Disable global run default

* Specify shell for step

* Change approach with kaniko

* Try less verbose shell spec

* Add submodule pull

* Add promote step

* Adjust dependency chain

* Try default swap again

* Use env

* Don't override aws key

* Make kaniko build conditional

* Specify runs on

* Try without dependency link

* Try soft fail

* Use image with git

* Try passing to next step

* Fix duplicate

* Try other approach

* Try other approach

* Fix typo

* Try other syntax

* Set env

* Adjust setup

* Try step 1

* Add link

* Try global env

* Fix mistake

* Debug

* Try other syntax

* Try other approach

* Change order

* Move output one step down

* Put output up one level

* Try other syntax

* Skip build

* Try output

* Re-enable build

* Try other syntax

* Skip middle step

* Update check

* Try first step of dockerhub push

* Update needs dependency

* Try explicit dir

* Add missing package

* Try other approach

* Try other approach

* Specify region

* Use with

* Try other approach

* Add debug

* Try other approach

* Set region

* Follow AWS example

* Try github approach

* Skip Qemu

* Try stdin

* Missing steps

* Add missing close

* Add echo debug

* Try v2 endpoint

* Use v1 endpoint

* Try without quotes

* Revert

* Try crane

* Add debug

* Split steps

* Fix duplicate

* Add shell step

* Conform to options

* Add verbose flag

* Try single step

* Try workaround

* First request fails hunch

* Try bullseye image

* Try other approach

* Adjust verbose level

* Try previous step

* Add more debug

* Remove debug step

* Remove rogue indent

* Try with larger image

* Add build tag step

* Update workflow for testing

* Add tag step for test

* Remove unused

* Update dependency chain

* Add ownership fix

* Use matrix for promote

* Force update

* Force build

* Remove unused

* Add new image

* Add missing argument

* Update dockerfile copy

* Update Dockerfile

* Update clone

* Update dockerfile

* Go to correct folder

* Use correct format

* Update dockerfile

* Remove cd

* Debug find where we are

* Add debug on first step

* Changedir to postgres

* Set workdir

* Use v1 approach

* Use other dependency

* Try other approach

* Try other approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update dockerfile

* Add workspace hack

* Update Dockerfile

* Update Dockerfile

* Update Dockerfile

* Change last step

* Cleanup pull in prep for review

* Force build images

* Add condition for latest tagging

* Use pinned version

* Try without name value

* Remove more names

* Shorten names

* Add kaniko comments

* Pin kaniko

* Pin crane and ecr helper

* Up one level

* Switch to pinned tag for rust image

* Force update for test

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>

* Add missing step output, revert one deploy step (#2285)

* Add missing step output, revert one deploy step

* Conform to syntax

* Update approach

* Add missing value

* Add missing needs

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Error for fatal not git repo (#2286)

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Use main, not branch for ref check (#2288)

* Use main, not branch for ref check

* Add more debug

* Count main, not head

* Try new approach

* Conform to syntax

* Update approach

* Get full history

* Skip checkout

* Cleanup debug

* Remove more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix docker zombie process issue (#2289)

* Fix docker zombie process issue

* Init everywhere

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix 1.63 clippy lints (#2282)

* split out timeline metrics, track layer map loading and size calculation

* reset rust cache for clippy run to avoid an ICE

additionally remove trailing whitespaces

* Rename pg_control_ffi.h to bindgen_deps.h, for clarity.

The pg_control_ffi.h name implies that it only includes stuff related to
pg_control.h. That's mostly true currently, but really the point of the
file is to include everything that we need to generate Rust definitions
from.

* Make local mypy behave like CI mypy (#2291)

* Fix flaky pageserver restarts in tests (#2261)

* Remove extra type aliases (#2280)

* Update cachepot endpoint (#2290)

* Update cachepot endpoint

* Update dockerfile & remove env

* Update image building process

* Cannot use metadata endpoint for this

* Update workflow

* Conform to kaniko syntax

* Update syntax

* Update approach

* Update dockerfiles

* Force update

* Update dockerfiles

* Update dockerfile

* Cleanup dockerfiles

* Update s3 test location

* Revert s3 experiment

* Add more debug

* Specify aws region

* Remove debug, add prefix

* Remove one more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* workflows/benchmarking: increase timeout (#2294)

* Rework `init` in pageserver CLI  (#2272)

* Do not create initial tenant and timeline (adjust Python tests for that)
* Rework config handling during init, add --update-config to manage local config updates

* Fix: Always build images (#2296)

* Always build images

* Remove unused

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Move auto-generated 'bindings' to a separate inner module.

Re-export only things that are used by other modules.

In the future, I'm imagining that we run bindgen twice, for Postgres
v14 and v15. The two sets of bindings would go into separate
'bindings_v14' and 'bindings_v15' modules.

Rearrange postgres_ffi modules.

Move function, to avoid Postgres version dependency in timelines.rs
Move function to generate a logical-message WAL record to postgres_ffi.

* fix cargo test

* Fix walreceiver and safekeeper bugs (#2295)

- There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver.
- IDENTIFY_SYSTEM previously returned `flush_lsn` as a physical end of WAL. Now it returns `flush_lsn` (as it was) to walproposer and `commit_lsn` to everyone else including pageserver.
- There was an issue with backoff where connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was in sleeping before establishing the connection. This is fixed by reworking retry logic.
- There was an issue with getting the `NoKeepAlives` reason in a loop. The issue is probably the same as the previous one.
- There was an issue with filtering safekeepers based on retry attempts, which could filter out some safekeepers indefinitely. This is fixed by using a retry cooldown duration instead of retry attempts.
- Some `send_wal.rs` connections failed with errors without context. This is fixed by adding a timeline to safekeeper errors.

New retry logic works like this (a rough sketch follows the list):
- Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment
- When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect.
- When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly.
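
A rough, self-contained sketch of these rules (field and method names are invented, not walreceiver's actual types):

```rust
use std::time::{Duration, Instant};

struct Candidate {
    next_retry_at: Option<Instant>,
    retries: u32,
}

impl Candidate {
    // On disconnect: push next_retry_at out with exponential backoff.
    fn on_disconnect(&mut self) {
        let base = Duration::from_secs(1);
        let backoff = base * 2u32.saturating_pow(self.retries).min(60);
        self.next_retry_at = Some(Instant::now() + backoff);
        self.retries += 1;
    }

    // On WAL progress: reset the cooldown, so walreceiver may reconnect
    // to this safekeeper instantly.
    fn on_progress(&mut self) {
        self.retries = 0;
        self.next_retry_at = None;
    }

    // A candidate is not considered for connection until its cooldown passes.
    fn eligible(&self, now: Instant) -> bool {
        self.next_retry_at.map_or(true, |t| now >= t)
    }
}
```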

* on safekeeper registration pass availability zone param (#2292)

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Anton Galitsyn <agalitsyn@users.noreply.github.com>
2022-08-18 15:32:33 +03:00
Arthur Petukhovsky
873347f977 Merge pull request #2275 from neondatabase/main
* github/workflows: Fix git dubious ownership (#2223)

* Move relation size cache from WalIngest to DatadirTimeline (#2094)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* refactor: replace lazy-static with once-cell (#2195)

- Replacing all the occurrences of lazy-static with `once-cell::sync::Lazy`
- fixes #1147

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>

* Add more buckets to pageserver latency metrics (#2225)

* ignore record property warning to fix benchmarks

* increase statement timeout

* use event so it fires only if workload thread successfully finished

* remove debug log

* increase timeout to pass test with real s3

* avoid duplicate parameter, increase timeout

* Major migration script (#2073)

This script can be used to migrate a tenant across breaking storage versions or (in the future) to upgrade Postgres versions. See the comment at the top for an overview.

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>

* Fix etcd typos

* Fix links to safekeeper protocol docs. (#2188)

safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in
commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format.

Fixes issue #1475. Thanks to @banks for spotting the outdated references.

In addition to fixing the above issue, this patch also fixes other broken links as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>

* Update CONTRIBUTING.md

* Update CONTRIBUTING.md

* support node id and remote storage params in docker_entrypoint.sh

* Safe truncate (#2218)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Check if the relation exists before trying to truncate it

refer #1932

* Add test reproducing FSM truncate problem

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Fix exponential backoff values

* Update `vendor/postgres` back; it was changed accidentally. (#2251)

Commit 4227cfc96e accidentally reverted vendor/postgres to an older
version. Update it back.

* Add pageserver checkpoint_timeout option.

To flush the in-memory layer eventually when no new data arrives, which helps
safekeepers suspend activity (stop pushing to the broker). The default of 10m
should be OK.

* Share exponential backoff code and fix logic for delete task failure (#2252)

* Fix bug when import large (>1GB) relations (#2172)

Resolves #2097 

- use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get`
- update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. 
  + `small` is the old test
  + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build

* Fix timeline physical size flaky tests (#2244)

Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
We need to wait for the pageserver to catch up with the compute's last flush LSN because, during the timeline physical size API call, `LayerFlushThread` threads may still be running. These threads flush new layers to disk and hence update the physical size, resulting in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads run **concurrently**, so it's possible that the above error persists even with this patch. However, making the tests wait until all the WAL is processed (not flushed) before calculating the physical size should reduce the "flakiness" significantly.

* postgres_ffi/waldecoder: validate more header fields

* postgres_ffi/waldecoder: remove unused startlsn

* postgres_ffi/waldecoder: introduce explicit `enum State`

Previously it was emulated with a combination of nullable fields.
This change should make the logic more readable.
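
An invented before/after sketch of what such a change looks like (names are illustrative, not the actual waldecoder types):

```rust
// Before: the decoder's state was emulated with nullable fields, and only
// certain combinations of Some/None were actually meaningful.
struct DecoderBefore {
    startlsn: Option<u64>,
    partial_record: Option<Vec<u8>>,
}

// After: each state names exactly the data it carries, so impossible
// combinations cannot be represented.
enum State {
    WaitingForRecord,
    ReassemblingRecord { partial: Vec<u8> },
    SkippingEverything { skip_until_lsn: u64 },
}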

* disable `test_import_from_pageserver_multisegment` (#2258)

This test now fails consistently on `main`. It's better to disable it temporarily to avoid blocking others' PRs while the root cause of the failure is investigated.

See: #2255, #2256

* get_binaries uses DOCKER_TAG taken from docker image build step (#2260)

* [proxy] Rework wire format of the password hack and some errors (#2236)

The new format has a few benefits: it's shorter, simpler and
human-readable as well. We don't use base64 anymore, since
URL encoding has us covered.

We also show a better error in case we couldn't parse the
payload; the users should know it's all about passing the
correct project name.

* test_runner/pg_clients: collect docker logs (#2259)

* get_binaries script fix (#2263)

* get_binaries uses DOCKER_TAG taken from docker image build step

* remove docker tag discovery at all and fix get_binaries for version variable

* Better storage sync logs (#2268)

* Find end of WAL on safekeepers using WalStreamDecoder.

We could have put it inside wal_storage.rs, but taking into account that
 - wal_storage.rs reading is async
 - we don't need s3 here
 - error handling is different; error during decoding is normal
I decided to put it separately.

Test: cargo test test_find_end_of_wal_last_crossing_segment
The test, prepared earlier by @yeputons, passes now.

Fixes https://github.com/neondatabase/neon/issues/544
      https://github.com/neondatabase/cloud/issues/2004
Supersedes https://github.com/neondatabase/neon/pull/2066

* Improve walreceiver logic (#2253)

This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers.

- There was a bug where `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` filtered out all safekeepers in some strange cases. I removed this filter; it should probably help with #2237
- Now walreceiver_connection reports status, including commit_lsn. This allows keeping safekeeper connection even when etcd is down.
- Safekeeper connection now fails if pageserver doesn't receive safekeeper messages for some time. Usually safekeeper sends messages at least once per second.
- `LaggingWal` check now uses `commit_lsn` directly from safekeeper. This fixes the issue with often reconnects, when compute generates WAL really fast.
- `NoWalTimeout` is rewritten to trigger only when we know about new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout`, because it will trigger only when we observe that the connected safekeeper is stuck.

* increase timeout in wait_for_upload to avoid spurious failures when testing with real s3

* Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274)

* Set up a workflow to run pgbench against captest (#2077)

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
Co-authored-by: Ankur Srivastava <ansrivas@users.noreply.github.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>
Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Egor Suvorov <egor@neon.tech>
Co-authored-by: Andrey Taranik <andrey@cicd.team>
Co-authored-by: Dmitry Ivanov <ivadmi5@gmail.com>
2022-08-15 21:30:45 +03:00
Arthur Petukhovsky
e814ac16f9 Merge pull request #2219 from neondatabase/main
Release 2022-08-04
2022-08-04 20:06:34 +03:00
Heikki Linnakangas
ad3055d386 Merge pull request #2203 from neondatabase/release-uuid-ossp
Deploy new storage and compute version to production

Release 2022-08-02
2022-08-02 15:08:14 +03:00
Heikki Linnakangas
94e03eb452 Merge remote-tracking branch 'origin/main' into 'release'
Release 2022-08-01
2022-08-02 12:43:49 +03:00
Sergey Melnikov
380f26ef79 Merge pull request #2170 from neondatabase/main (Release 2022-07-28)
Release 2022-07-28
2022-07-28 14:16:52 +03:00
Arthur Petukhovsky
3c5b7f59d7 Merge pull request #2119 from neondatabase/main
Release 2022-07-19
2022-07-19 11:58:48 +03:00
Arthur Petukhovsky
fee89f80b5 Merge pull request #2115 from neondatabase/main-2022-07-18
Release 2022-07-18
2022-07-18 19:21:11 +03:00
Arthur Petukhovsky
41cce8eaf1 Merge remote-tracking branch 'origin/release' into main-2022-07-18 2022-07-18 18:21:20 +03:00
Alexey Kondratov
f88fe0218d Merge pull request #1842 from neondatabase/release-deploy-hotfix
[HOTFIX] Release deploy fix

This PR uses the branch neondatabase/postgres#171 and several required commits from `main` to use only locally built compute-tools. This should allow us to roll out the safekeeper sync issue fix in prod.
2022-06-01 11:04:30 +03:00
Alexey Kondratov
cc856eca85 Install missing openssl packages in the Github Actions workflow 2022-05-31 21:31:31 +02:00
Alexey Kondratov
cf350c6002 Use :local compute-tools tag to build compute-node image 2022-05-31 21:31:16 +02:00
Arseny Sher
0ce6b6a0a3 Merge pull request #1836 from neondatabase/release-hotfix-basebackup-lsn-page-boundary
Bump vendor/postgres to hotfix basebackup LSN comparison.
2022-05-31 16:54:03 +04:00
Arseny Sher
73f247d537 Bump vendor/postgres to hotfix basebackup LSN comparison. 2022-05-31 16:00:50 +04:00
Andrey Taranik
960be82183 Merge pull request #1792 from neondatabase/main
Release 2022-05-25 (second)
2022-05-25 16:37:57 +03:00
Andrey Taranik
806e5a6c19 Merge pull request #1787 from neondatabase/main
Release 2022-05-25
2022-05-25 13:34:11 +03:00
Alexey Kondratov
8d5df07cce Merge pull request #1385 from zenithdb/main
Release main 2022-03-22
2022-03-22 05:04:34 -05:00
Andrey Taranik
df7a9d1407 release fix 2022-03-16 (#1375) 2022-03-17 00:43:28 +03:00
206 changed files with 3772 additions and 11328 deletions

View File

@@ -461,7 +461,6 @@ jobs:
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
build_type: ${{ matrix.build_type }}
test_selection: regress
@@ -1121,16 +1120,10 @@ jobs:
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
@@ -1139,15 +1132,6 @@ jobs:
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployStorage=false \
-f deployStorageBroker=false \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \

View File

@@ -1,13 +1,12 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

Cargo.lock generated
View File

@@ -276,7 +276,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"aws-sdk-secretsmanager",
"camino",
"clap",
"control_plane",
@@ -288,8 +288,6 @@ dependencies = [
"hex",
"humantime",
"hyper",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
@@ -297,7 +295,6 @@ dependencies = [
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
@@ -346,9 +343,9 @@ dependencies = [
[[package]]
name = "aws-credential-types"
version = "1.1.8"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
dependencies = [
"aws-smithy-async",
"aws-smithy-runtime-api",
@@ -358,9 +355,9 @@ dependencies = [
[[package]]
name = "aws-runtime"
version = "1.1.8"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
dependencies = [
"aws-credential-types",
"aws-sigv4",
@@ -380,29 +377,6 @@ dependencies = [
"uuid",
]
[[package]]
name = "aws-sdk-iam"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json",
"aws-smithy-query",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
"aws-smithy-xml",
"aws-types",
"http 0.2.9",
"once_cell",
"regex-lite",
"tracing",
]
[[package]]
name = "aws-sdk-s3"
version = "1.14.0"
@@ -432,6 +406,29 @@ dependencies = [
"url",
]
[[package]]
name = "aws-sdk-secretsmanager"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
dependencies = [
"aws-credential-types",
"aws-runtime",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-json",
"aws-smithy-runtime",
"aws-smithy-runtime-api",
"aws-smithy-types",
"aws-types",
"bytes",
"fastrand 2.0.0",
"http 0.2.9",
"once_cell",
"regex-lite",
"tracing",
]
[[package]]
name = "aws-sdk-sso"
version = "1.12.0"
@@ -501,9 +498,9 @@ dependencies = [
[[package]]
name = "aws-sigv4"
version = "1.2.0"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
dependencies = [
"aws-credential-types",
"aws-smithy-eventstream",
@@ -516,7 +513,7 @@ dependencies = [
"hex",
"hmac",
"http 0.2.9",
"http 1.1.0",
"http 1.0.0",
"once_cell",
"p256",
"percent-encoding",
@@ -530,9 +527,9 @@ dependencies = [
[[package]]
name = "aws-smithy-async"
version = "1.1.8"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
dependencies = [
"futures-util",
"pin-project-lite",
@@ -573,9 +570,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http"
version = "0.60.7"
version = "0.60.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
dependencies = [
"aws-smithy-eventstream",
"aws-smithy-runtime-api",
@@ -594,18 +591,18 @@ dependencies = [
[[package]]
name = "aws-smithy-json"
version = "0.60.7"
version = "0.60.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
dependencies = [
"aws-smithy-types",
]
[[package]]
name = "aws-smithy-query"
version = "0.60.7"
version = "0.60.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
dependencies = [
"aws-smithy-types",
"urlencoding",
@@ -613,9 +610,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.1.8"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -638,15 +635,14 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime-api"
version = "1.2.0"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
dependencies = [
"aws-smithy-async",
"aws-smithy-types",
"bytes",
"http 0.2.9",
"http 1.1.0",
"pin-project-lite",
"tokio",
"tracing",
@@ -655,9 +651,9 @@ dependencies = [
[[package]]
name = "aws-smithy-types"
version = "1.1.8"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
dependencies = [
"base64-simd",
"bytes",
@@ -678,18 +674,18 @@ dependencies = [
[[package]]
name = "aws-smithy-xml"
version = "0.60.7"
version = "0.60.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
dependencies = [
"xmlparser",
]
[[package]]
name = "aws-types"
version = "1.1.8"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
dependencies = [
"aws-credential-types",
"aws-smithy-async",
@@ -1350,7 +1346,6 @@ dependencies = [
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"nix 0.27.1",
"once_cell",
@@ -2396,9 +2391,9 @@ dependencies = [
[[package]]
name = "http"
version = "1.1.0"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
dependencies = [
"bytes",
"fnv",
@@ -2498,7 +2493,7 @@ dependencies = [
"hyper",
"log",
"rustls 0.21.9",
"rustls-native-certs 0.6.2",
"rustls-native-certs",
"tokio",
"tokio-rustls 0.24.0",
]
@@ -2884,35 +2879,6 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measured"
version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
dependencies = [
"bytes",
"hashbrown 0.14.0",
"itoa",
"lasso",
"measured-derive",
"memchr",
"parking_lot 0.12.1",
"rustc-hash",
"ryu",
]
[[package]]
name = "measured-derive"
version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "memchr"
version = "2.6.4"
@@ -3563,7 +3529,6 @@ dependencies = [
"postgres_connection",
"postgres_ffi",
"pq_proto",
"procfs",
"rand 0.8.5",
"regex",
"remote_storage",
@@ -3581,7 +3546,6 @@ dependencies = [
"strum_macros",
"svg_fmt",
"sync_wrapper",
"sysinfo",
"tenant_size_model",
"thiserror",
"tokio",
@@ -3935,7 +3899,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3948,7 +3912,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
dependencies = [
"native-tls",
"tokio",
@@ -3959,7 +3923,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3972,13 +3936,12 @@ dependencies = [
"rand 0.8.5",
"sha2",
"stringprep",
"tokio",
]
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4200,10 +4163,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"aws-config",
"aws-sdk-iam",
"aws-sigv4",
"aws-types",
"base64 0.13.1",
"bstr",
"bytes",
@@ -4214,7 +4173,6 @@ dependencies = [
"consumption_metrics",
"dashmap",
"env_logger",
"fallible-iterator",
"futures",
"git-version",
"hashbrown 0.13.2",
@@ -4222,7 +4180,6 @@ dependencies = [
"hex",
"hmac",
"hostname",
"http 1.1.0",
"humantime",
"hyper",
"hyper-tungstenite",
@@ -4266,7 +4223,6 @@ dependencies = [
"smallvec",
"smol_str",
"socket2 0.5.5",
"subtle",
"sync_wrapper",
"task-local-extensions",
"thiserror",
@@ -4438,9 +4394,9 @@ dependencies = [
[[package]]
name = "redis"
version = "0.25.2"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
dependencies = [
"async-trait",
"bytes",
@@ -4449,15 +4405,15 @@ dependencies = [
"itoa",
"percent-encoding",
"pin-project-lite",
"rustls 0.22.2",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"rustls 0.21.9",
"rustls-native-certs",
"rustls-pemfile 1.0.2",
"rustls-webpki 0.101.7",
"ryu",
"sha1_smol",
"socket2 0.5.5",
"socket2 0.4.9",
"tokio",
"tokio-rustls 0.25.0",
"tokio-rustls 0.24.0",
"tokio-util",
"url",
]
@@ -4886,19 +4842,6 @@ dependencies = [
"security-framework",
]
[[package]]
name = "rustls-native-certs"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
dependencies = [
"openssl-probe",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"schannel",
"security-framework",
]
[[package]]
name = "rustls-pemfile"
version = "1.0.2"
@@ -5401,23 +5344,13 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
[[package]]
name = "sha2"
version = "0.10.8"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
"sha2-asm",
]
[[package]]
name = "sha2-asm"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
dependencies = [
"cc",
]
[[package]]
@@ -5953,7 +5886,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
dependencies = [
"futures",
"nix 0.26.4",
@@ -6000,7 +5933,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
dependencies = [
"async-trait",
"byteorder",
@@ -6166,7 +6099,7 @@ dependencies = [
"percent-encoding",
"pin-project",
"prost",
"rustls-native-certs 0.6.2",
"rustls-native-certs",
"rustls-pemfile 1.0.2",
"tokio",
"tokio-rustls 0.24.0",
@@ -6490,7 +6423,7 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
dependencies = [
"bytes",
"io-uring",
@@ -6533,7 +6466,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"bincode",
"byteorder",
@@ -6572,14 +6504,12 @@ dependencies = [
"thiserror",
"tokio",
"tokio-stream",
"tokio-tar",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"uuid",
"walkdir",
"workspace_hack",
]
@@ -7051,6 +6981,7 @@ dependencies = [
"aws-sigv4",
"aws-smithy-async",
"aws-smithy-http",
"aws-smithy-runtime-api",
"aws-smithy-types",
"axum",
"base64 0.21.1",
@@ -7096,7 +7027,6 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"sha2",
"smallvec",
"subtle",
"syn 1.0.109",

View File

@@ -52,12 +52,10 @@ async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.14"
aws-sdk-iam = "1.15.0"
aws-sdk-secretsmanager = { version = "1.14.0" }
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.1.7"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -78,7 +76,6 @@ either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
@@ -91,7 +88,6 @@ hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
@@ -105,7 +101,6 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.13", features=["default", "lasso"] }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -125,7 +120,7 @@ procfs = "0.14"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -153,7 +148,6 @@ smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"

View File

@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.77.0
ENV RUSTC_VERSION=1.76.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install --git https://github.com/paritytech/cachepot && \
cargo install rustfilt && \
cargo install cargo-hakari && \
cargo install cargo-deny --locked && \
cargo install cargo-deny && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \

View File

@@ -238,14 +238,6 @@ If you encounter errors during setting up the initial tenant, it's best to stop
## Running tests
### Rust unit tests
We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
You can install `cargo-nextest` with `cargo install cargo-nextest`.
### Integration tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
```sh

View File

@@ -2,8 +2,6 @@ disallowed-methods = [
"tokio::task::block_in_place",
# Allow this for now, to deny it later once we stop using Handle::block_on completely
# "tokio::runtime::Handle::block_on",
# use tokio_epoll_uring_ext instead
"tokio_epoll_uring::thread_local_system",
]
disallowed-macros = [

View File

@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
-b /usr/local/bin/postgres
```
## State Diagram
Computes can be in various states. Below is a diagram that details how a
compute moves between states.
```mermaid
%% https://mermaid.js.org/syntax/stateDiagram.html
stateDiagram-v2
[*] --> Empty : Compute spawned
Empty --> ConfigurationPending : Waiting for compute spec
ConfigurationPending --> Configuration : Received compute spec
Configuration --> Failed : Failed to configure the compute
Configuration --> Running : Compute has been configured
Empty --> Init : Compute spec is immediately available
Empty --> TerminationPending : Requested termination
Init --> Failed : Failed to start Postgres
Init --> Running : Started Postgres
Running --> TerminationPending : Requested termination
TerminationPending --> Terminated : Terminated compute
Failed --> [*] : Compute exited
Terminated --> [*] : Compute exited
```
## Tests
Cargo formatter:

View File

@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
.write(true)
.create(true)
.append(false)
.truncate(false)
.open(path)?;
let buf = io::BufReader::new(&file);
let mut count: usize = 0;

View File

@@ -743,21 +743,19 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// which may happen in two cases:
// - extension was just installed
// - extension was already installed and is up to date
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query)?;
Ok(())
}
#[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade (not really)");
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade");
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query)?;
Ok(())
}

View File

@@ -12,7 +12,6 @@ clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
humantime.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true

View File

@@ -16,7 +16,7 @@ testing = []
[dependencies]
anyhow.workspace = true
aws-config.workspace = true
bytes.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
@@ -25,20 +25,17 @@ git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
reqwest.workspace = true
routerify.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
measured.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }

View File

@@ -1,3 +0,0 @@
UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';

View File

@@ -1,3 +0,0 @@
UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';

View File

@@ -1,227 +0,0 @@
use futures::{stream::FuturesUnordered, StreamExt};
use std::{
collections::HashMap,
sync::Arc,
time::{Duration, Instant},
};
use tokio_util::sync::CancellationToken;
use pageserver_api::{
controller_api::{NodeAvailability, UtilizationScore},
models::PageserverUtilization,
};
use thiserror::Error;
use utils::id::NodeId;
use crate::node::Node;
struct HeartbeaterTask {
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
cancel: CancellationToken,
state: HashMap<NodeId, PageserverState>,
max_unavailable_interval: Duration,
jwt_token: Option<String>,
}
#[derive(Debug, Clone)]
pub(crate) enum PageserverState {
Available {
last_seen_at: Instant,
utilization: PageserverUtilization,
},
Offline,
}
#[derive(Debug)]
pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
#[derive(Debug, Error)]
pub(crate) enum HeartbeaterError {
#[error("Cancelled")]
Cancel,
}
struct HeartbeatRequest {
pageservers: Arc<HashMap<NodeId, Node>>,
reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
}
pub(crate) struct Heartbeater {
sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
}
impl Heartbeater {
pub(crate) fn new(
jwt_token: Option<String>,
max_unavailable_interval: Duration,
cancel: CancellationToken,
) -> Self {
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
let mut heartbeater =
HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
tokio::task::spawn(async move { heartbeater.run().await });
Self { sender }
}
pub(crate) async fn heartbeat(
&self,
pageservers: Arc<HashMap<NodeId, Node>>,
) -> Result<AvailablityDeltas, HeartbeaterError> {
let (sender, receiver) = tokio::sync::oneshot::channel();
self.sender
.send(HeartbeatRequest {
pageservers,
reply: sender,
})
.unwrap();
receiver.await.unwrap()
}
}
impl HeartbeaterTask {
fn new(
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
jwt_token: Option<String>,
max_unavailable_interval: Duration,
cancel: CancellationToken,
) -> Self {
Self {
receiver,
cancel,
state: HashMap::new(),
max_unavailable_interval,
jwt_token,
}
}
async fn run(&mut self) {
loop {
tokio::select! {
request = self.receiver.recv() => {
match request {
Some(req) => {
let res = self.heartbeat(req.pageservers).await;
req.reply.send(res).unwrap();
},
None => { return; }
}
},
_ = self.cancel.cancelled() => return
}
}
}
async fn heartbeat(
&mut self,
pageservers: Arc<HashMap<NodeId, Node>>,
) -> Result<AvailablityDeltas, HeartbeaterError> {
let mut new_state = HashMap::new();
let mut heartbeat_futs = FuturesUnordered::new();
for (node_id, node) in &*pageservers {
heartbeat_futs.push({
let jwt_token = self.jwt_token.clone();
let cancel = self.cancel.clone();
// Clone the node and mark it as available such that the request
// goes through to the pageserver even when the node is marked offline.
// This doesn't impact the availability observed by [`crate::service::Service`].
let mut node = node.clone();
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
async move {
let response = node
.with_client_retries(
|client| async move { client.get_utilization().await },
&jwt_token,
3,
3,
Duration::from_secs(1),
&cancel,
)
.await;
let response = match response {
Some(r) => r,
None => {
// This indicates cancellation of the request.
// We ignore the node in this case.
return None;
}
};
let status = if let Ok(utilization) = response {
PageserverState::Available {
last_seen_at: Instant::now(),
utilization,
}
} else {
PageserverState::Offline
};
Some((*node_id, status))
}
});
loop {
let maybe_status = tokio::select! {
next = heartbeat_futs.next() => {
match next {
Some(result) => result,
None => { break; }
}
},
_ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
};
if let Some((node_id, status)) = maybe_status {
new_state.insert(node_id, status);
}
}
}
let mut deltas = Vec::new();
let now = Instant::now();
for (node_id, ps_state) in new_state {
use std::collections::hash_map::Entry::*;
let entry = self.state.entry(node_id);
let mut needs_update = false;
match entry {
Occupied(ref occ) => match (occ.get(), &ps_state) {
(PageserverState::Offline, PageserverState::Offline) => {}
(PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
if now - *last_seen_at >= self.max_unavailable_interval {
deltas.push((node_id, ps_state.clone()));
needs_update = true;
}
}
_ => {
deltas.push((node_id, ps_state.clone()));
needs_update = true;
}
},
Vacant(_) => {
deltas.push((node_id, ps_state.clone()));
}
}
match entry {
Occupied(mut occ) if needs_update => {
(*occ.get_mut()) = ps_state;
}
Vacant(vac) => {
vac.insert(ps_state);
}
_ => {}
}
}
Ok(AvailablityDeltas(deltas))
}
}

View File

@@ -1,11 +1,5 @@
use crate::metrics::{
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
METRICS_REGISTRY,
};
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
@@ -20,7 +14,7 @@ use tokio_util::sync::CancellationToken;
use utils::auth::{Scope, SwappableJwtAuth};
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::id::{TenantId, TimelineId};
use utils::{
@@ -34,14 +28,12 @@ use utils::{
};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware;
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
@@ -256,10 +248,8 @@ async fn handle_tenant_secondary_download(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
json_response(status, progress)
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_delete(
@@ -321,7 +311,7 @@ async fn handle_tenant_timeline_passthrough(
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
// Find the node that holds shard zero
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
// Callers will always pass an unsharded tenant ID. Before proxying, we must
// rewrite this to a shard-aware shard zero ID.
@@ -330,39 +320,12 @@ async fn handle_tenant_timeline_passthrough(
let tenant_shard_str = format!("{}", tenant_shard_id);
let path = path.replace(&tenant_str, &tenant_shard_str);
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_passthrough_request_latency;
// This is a bit awkward. We remove the tenant shard id from the path
// and join the remaining segments with '_' to get a label for the request.
let just_path = path.replace(&tenant_shard_str, "");
let path_label = just_path
.split('/')
.filter(|token| !token.is_empty())
.collect::<Vec<_>>()
.join("_");
let labels = PageserverRequestLabelGroup {
pageserver_id: &node.get_id().to_string(),
path: &path_label,
method: crate::metrics::Method::Get,
};
let _timer = latency.start_timer(labels.clone());
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
let resp = client.get_raw(path).await.map_err(|_e|
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
// if we can't successfully send a request to the pageserver, we aren't available.
ApiError::ShuttingDown)?;
if !resp.status().is_success() {
let error_counter = &METRICS_REGISTRY
.metrics_group
.storage_controller_passthrough_request_error;
error_counter.inc(labels);
}
// We have a reqwest::Response, would like an http::Response
let mut builder = hyper::Response::builder()
.status(resp.status())
@@ -388,16 +351,6 @@ async fn handle_tenant_locate(
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
}
async fn handle_tenant_describe(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -436,14 +389,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
json_response(
StatusCode::OK,
state
.service
.node_configure(
config_req.node_id,
config_req.availability.map(NodeAvailability::from),
config_req.scheduling,
)
.await?,
state.service.node_configure(config_req).await?,
)
}
@@ -533,11 +479,7 @@ impl From<ReconcileError> for ApiError {
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(
request: Request<Body>,
handler: H,
request_name: RequestName,
) -> R::Output
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -557,10 +499,9 @@ where
));
}
named_request_span(
request_span(
request,
|request| async move { handler(service, request).await },
request_name,
)
.await
}
@@ -571,98 +512,11 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
})
}
#[derive(Clone, Debug)]
struct RequestMeta {
method: hyper::http::Method,
at: Instant,
}
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
let meta = RequestMeta {
method: req.method().clone(),
at: Instant::now(),
};
req.set_context(meta);
Ok(req)
})
}
fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::post_with_info(move |resp, req_info| async move {
let request_name = match req_info.context::<RequestName>() {
Some(name) => name,
None => {
return Ok(resp);
}
};
if let Some(meta) = req_info.context::<RequestMeta>() {
let status = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_http_request_status;
let latency = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_http_request_latency;
status.inc(HttpRequestStatusLabelGroup {
path: request_name.0,
method: meta.method.clone().into(),
status: crate::metrics::StatusCode(resp.status()),
});
latency.observe(
HttpRequestLatencyLabelGroup {
path: request_name.0,
method: meta.method.into(),
},
meta.at.elapsed().as_secs_f64(),
);
}
Ok(resp)
})
}
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let payload = crate::metrics::METRICS_REGISTRY.encode();
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, TEXT_FORMAT)
.body(payload.into())
.unwrap();
Ok(response)
}
#[derive(Clone)]
struct RequestName(&'static str);
async fn named_request_span<R, H>(
request: Request<Body>,
handler: H,
name: RequestName,
) -> R::Output
where
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
{
request.set_context(name);
request_span(request, handler).await
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware())
.middleware(epilogue_metrics_middleware());
let mut router = endpoint::make_router();
if auth.is_some() {
router = router.middleware(auth_middleware(|request| {
let state = get_state(request);
@@ -671,166 +525,96 @@ pub fn make_router(
} else {
state.auth.as_deref()
}
}));
}))
}
router
.data(Arc::new(HttpState::new(service, auth)))
.get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
})
// Non-prefixed generic endpoints (status, metrics)
.get("/status", |r| {
named_request_span(r, handle_status, RequestName("status"))
})
.get("/ready", |r| {
named_request_span(r, handle_ready, RequestName("ready"))
})
.get("/status", |r| request_span(r, handle_status))
.get("/ready", |r| request_span(r, handle_ready))
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
.post("/upcall/v1/re-attach", |r| {
named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
})
.post("/upcall/v1/validate", |r| {
named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
request_span(r, handle_re_attach)
})
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
// Test/dev/debug endpoints
.post("/debug/v1/attach-hook", |r| {
named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
})
.post("/debug/v1/inspect", |r| {
named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
request_span(r, handle_attach_hook)
})
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
request_span(r, handle_tenant_drop)
})
.post("/debug/v1/node/:node_id/drop", |r| {
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
})
.get("/debug/v1/tenant", |r| {
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
})
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(
r,
handle_tenant_locate,
RequestName("debug_v1_tenant_locate"),
)
request_span(r, handle_node_drop)
})
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
.get("/debug/v1/scheduler", |r| {
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
request_span(r, handle_scheduler_dump)
})
.post("/debug/v1/consistency_check", |r| {
named_request_span(
r,
handle_consistency_check,
RequestName("debug_v1_consistency_check"),
)
request_span(r, handle_consistency_check)
})
.put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
})
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
// Node operations
.post("/control/v1/node", |r| {
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
})
.get("/control/v1/node", |r| {
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
request_span(r, handle_node_register)
})
.get("/control/v1/node", |r| request_span(r, handle_node_list))
.put("/control/v1/node/:node_id/config", |r| {
named_request_span(
r,
handle_node_configure,
RequestName("control_v1_node_config"),
)
request_span(r, handle_node_configure)
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(
r,
handle_tenant_shard_migrate,
RequestName("control_v1_tenant_migrate"),
)
tenant_service_handler(r, handle_tenant_shard_migrate)
})
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(
r,
handle_tenant_shard_split,
RequestName("control_v1_tenant_shard_split"),
)
})
.get("/control/v1/tenant/:tenant_id", |r| {
tenant_service_handler(
r,
handle_tenant_describe,
RequestName("control_v1_tenant_describe"),
)
tenant_service_handler(r, handle_tenant_shard_split)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
.post("/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
tenant_service_handler(r, handle_tenant_create)
})
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
tenant_service_handler(
r,
handle_tenant_location_config,
RequestName("v1_tenant_location_config"),
)
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(
r,
handle_tenant_time_travel_remote_storage,
RequestName("v1_tenant_time_travel_remote_storage"),
)
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(
r,
handle_tenant_secondary_download,
RequestName("v1_tenant_secondary_download"),
)
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_delete,
RequestName("v1_tenant_timeline"),
)
tenant_service_handler(r, handle_tenant_timeline_delete)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_create,
RequestName("v1_tenant_timeline"),
)
tenant_service_handler(r, handle_tenant_timeline_create)
})
// Tenant detail GET passthrough to shard zero
.get("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_passthrough,
RequestName("v1_tenant_passthrough"),
)
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
// timeline GET APIs will be implicitly included.
.get("/v1/tenant/:tenant_id/timeline*", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_passthrough,
RequestName("v1_tenant_timeline_passthrough"),
)
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
}
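The passthrough handler above derives its metric label by stripping the shard id from the path and joining the remaining segments with '_'. That derivation collapses to a small pure function; a sketch with a hypothetical helper name (path_label is not part of the actual code):

    // Hypothetical helper mirroring the label derivation in
    // handle_tenant_timeline_passthrough.
    fn path_label(path: &str, tenant_shard_str: &str) -> String {
        path.replace(tenant_shard_str, "")
            .split('/')
            .filter(|token| !token.is_empty())
            .collect::<Vec<_>>()
            .join("_")
    }

    fn main() {
        let label = path_label("/v1/tenant/SHARD/timeline", "SHARD");
        assert_eq!(label, "v1_tenant_timeline");
    }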

View File

@@ -3,12 +3,10 @@ use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
mod heartbeater;
pub mod http;
mod id_lock_map;
pub mod metrics;
mod node;
mod pageserver_client;
pub mod persistence;
mod reconciler;
mod scheduler;

View File

@@ -2,7 +2,8 @@ use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
@@ -53,30 +54,6 @@ struct Cli {
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: Option<String>,
/// Flag to enable dev mode, which permits running without auth
#[arg(long, default_value = "false")]
dev: bool,
/// Grace period before marking unresponsive pageserver offline
#[arg(long)]
max_unavailable_interval: Option<humantime::Duration>,
}
enum StrictMode {
/// In strict mode, we will require that all secrets are loaded, i.e. security features
/// may not be implicitly turned off by omitting secrets in the environment.
Strict,
/// In dev mode, secrets are optional, and omitting a particular secret will implicitly
/// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
/// requests, no public key -> don't authenticate incoming requests).
Dev,
}
impl Default for StrictMode {
fn default() -> Self {
Self::Strict
}
}
/// Secrets may either be provided on the command line (for testing), or loaded from AWS Secrets Manager: this
@@ -89,6 +66,13 @@ struct Secrets {
}
impl Secrets {
const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
"neon-storage-controller-pageserver-jwt-token";
const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
@@ -99,41 +83,111 @@ impl Secrets {
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
let Some(database_url) =
Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
else {
anyhow::bail!(
"Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
)
};
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
None => None,
};
let this = Self {
database_url,
public_key,
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
control_plane_jwt_token: Self::load_secret(
&args.control_plane_jwt_token,
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
)
.await,
};
Ok(this)
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
}
}
async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
if let Some(v) = cli {
Some(v.clone())
} else if let Ok(v) = std::env::var(env_name) {
Some(v)
} else {
None
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
};
let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
.region(Region::new(region.clone()))
.load()
.await;
let asm = aws_sdk_secretsmanager::Client::new(&config);
let Some(database_url) = asm
.get_secret_value()
.secret_id(Self::DATABASE_URL_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string)
else {
anyhow::bail!(
"Database URL secret not found at {region}/{}",
Self::DATABASE_URL_SECRET
)
};
let jwt_token = asm
.get_secret_value()
.secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
if jwt_token.is_none() {
tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
}
let control_plane_jwt_token = asm
.get_secret_value()
.secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
if control_plane_jwt_token.is_none() {
tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the control plane");
}
let public_key = asm
.get_secret_value()
.secret_id(Self::PUBLIC_KEY_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
let public_key = match public_key {
Some(key) => Some(JwtAuth::from_key(key)?),
None => {
tracing::warn!(
"No public key set: inccoming HTTP requests will not be authenticated"
);
None
}
};
Ok(Self {
database_url,
public_key,
jwt_token,
control_plane_jwt_token,
})
}
fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
let public_key = match &args.public_key {
None => None,
Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
};
Ok(Self {
database_url: database_url.to_owned(),
public_key,
jwt_token: args.jwt_token.clone(),
control_plane_jwt_token: args.control_plane_jwt_token.clone(),
})
}
}
@@ -189,50 +243,12 @@ async fn async_main() -> anyhow::Result<()> {
args.listen
);
let strict_mode = if args.dev {
StrictMode::Dev
} else {
StrictMode::Strict
};
let secrets = Secrets::load(&args).await?;
// Validate required secrets and arguments are provided in strict mode
match strict_mode {
StrictMode::Strict
if (secrets.public_key.is_none()
|| secrets.jwt_token.is_none()
|| secrets.control_plane_jwt_token.is_none()) =>
{
// Production systems should always have secrets configured: if public_key was not set
// then we would implicitly disable auth.
anyhow::bail!(
"Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode"
);
}
StrictMode::Strict if args.compute_hook_url.is_none() => {
// Production systems should always have a compute hook set, to prevent falling
// back to trying to use neon_local.
anyhow::bail!(
"`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
);
}
StrictMode::Strict => {
tracing::info!("Starting in strict mode: configuration is OK.")
}
StrictMode::Dev => {
tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
}
}
let config = Config {
jwt_token: secrets.jwt_token,
control_plane_jwt_token: secrets.control_plane_jwt_token,
compute_hook_url: args.compute_hook_url,
max_unavailable_interval: args
.max_unavailable_interval
.map(humantime::Duration::into)
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
};
// After loading secrets & config, but before starting anything else, apply database migrations
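The removed `--max-unavailable-interval` flag pairs clap with humantime so operators can write values like "30s" or "5m"; the `.map(humantime::Duration::into)` call above converts the parsed value to a std Duration. A minimal sketch of that conversion, assuming only the humantime crate:

    use std::time::Duration;

    fn main() {
        // humantime accepts strings such as "30s" or "5m".
        let grace: Duration = humantime::parse_duration("30s").unwrap();
        assert_eq!(grace, Duration::from_secs(30));

        // The clap field is a humantime::Duration; `.into()` yields a std Duration.
        let parsed: humantime::Duration = "5m".parse().unwrap();
        let std_dur: Duration = parsed.into();
        assert_eq!(std_dur, Duration::from_secs(300));
    }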

View File

@@ -1,284 +1,32 @@
//!
//! This module provides metric definitions for the storage controller.
//!
//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
//! static.
//!
//! The rest of the code defines label group types and deals with converting outer types to labels.
//!
use bytes::Bytes;
use measured::{
label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use std::sync::Mutex;
use crate::persistence::{DatabaseError, DatabaseOperation};
pub(crate) struct ReconcilerMetrics {
pub(crate) spawned: IntCounter,
pub(crate) complete: IntCounterVec,
}
pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
Lazy::new(StorageControllerMetrics::default);
impl ReconcilerMetrics {
// Labels used on [`Self::complete`]
pub(crate) const SUCCESS: &'static str = "ok";
pub(crate) const ERROR: &'static str = "error";
pub(crate) const CANCEL: &'static str = "cancel";
}
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
spawned: register_int_counter!(
"storage_controller_reconcile_spawn",
"Count of how many times we spawn a reconcile task",
)
.expect("failed to define a metric"),
complete: register_int_counter_vec!(
"storage_controller_reconcile_complete",
"Reconciler tasks completed, broken down by success/failure/cancelled",
&["status"],
)
.expect("failed to define a metric"),
});
pub fn preinitialize_metrics() {
Lazy::force(&METRICS_REGISTRY);
}
pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::TextEncoder>,
}
#[derive(measured::MetricGroup)]
pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes
pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
/// Count of HTTP requests to the pageserver that resulted in an error,
/// broken down by the pageserver node id, request name and method
pub(crate) storage_controller_pageserver_request_error:
measured::CounterVec<PageserverRequestLabelGroupSet>,
/// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This includes both successful and unsuccessful
/// requests.
pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
/// Count of pass-through HTTP requests to the pageserver that resulted in an error,
/// broken down by the pageserver node id, request name and method
pub(crate) storage_controller_passthrough_request_error:
measured::CounterVec<PageserverRequestLabelGroupSet>,
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
/// Count of errors in database queries, broken down by error type and operation.
pub(crate) storage_controller_database_query_error:
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation.
pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
}
impl StorageControllerMetrics {
pub(crate) fn encode(&self) -> Bytes {
let mut encoder = self.encoder.lock().unwrap();
self.metrics_group.collect_into(&mut *encoder);
encoder.finish()
}
}
impl Default for StorageControllerMetrics {
fn default() -> Self {
Self {
metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = ReconcileCompleteLabelGroupSet)]
pub(crate) struct ReconcileCompleteLabelGroup {
pub(crate) status: ReconcileOutcome,
}
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
pub(crate) status: StatusCode,
}
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
pub(crate) error_type: DatabaseErrorLabel,
pub(crate) operation: DatabaseOperation,
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation,
}
#[derive(FixedCardinalityLabel)]
pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")]
Success,
Error,
Cancel,
}
#[derive(FixedCardinalityLabel, Clone)]
pub(crate) enum Method {
Get,
Put,
Post,
Delete,
Other,
}
impl From<hyper::Method> for Method {
fn from(value: hyper::Method) -> Self {
if value == hyper::Method::GET {
Method::Get
} else if value == hyper::Method::PUT {
Method::Put
} else if value == hyper::Method::POST {
Method::Post
} else if value == hyper::Method::DELETE {
Method::Delete
} else {
Method::Other
}
}
}
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as u64)
}
}
impl FixedCardinalityLabel for StatusCode {
fn cardinality() -> usize {
(100..1000).len()
}
fn encode(&self) -> usize {
self.0.as_u16() as usize
}
fn decode(value: usize) -> Self {
Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
}
}
#[derive(FixedCardinalityLabel)]
pub(crate) enum DatabaseErrorLabel {
Query,
Connection,
ConnectionPool,
Logical,
}
impl DatabaseError {
pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
match self {
Self::Query(_) => DatabaseErrorLabel::Query,
Self::Connection(_) => DatabaseErrorLabel::Connection,
Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
Self::Logical(_) => DatabaseErrorLabel::Logical,
}
}
Lazy::force(&RECONCILER);
}
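The restored metrics above use the prometheus-style `metrics` wrappers: register a counter vec once in a Lazy static, then bump it by label value. A self-contained sketch of the same idiom against the prometheus crate directly, which these wrappers resemble (the metric name here is illustrative):

    use prometheus::{register_int_counter_vec, Encoder, TextEncoder};

    fn main() {
        let complete = register_int_counter_vec!(
            "example_reconcile_complete",
            "Reconciler tasks completed, broken down by outcome",
            &["status"]
        )
        .expect("failed to define a metric");

        complete.with_label_values(&["ok"]).inc();
        complete.with_label_values(&["error"]).inc();

        // Render the default registry in the Prometheus text format.
        let mut buf = Vec::new();
        TextEncoder::new()
            .encode(&prometheus::gather(), &mut buf)
            .unwrap();
        println!("{}", String::from_utf8(buf).unwrap());
    }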

View File

@@ -12,9 +12,7 @@ use serde::Serialize;
use tokio_util::sync::CancellationToken;
use utils::{backoff, id::NodeId};
use crate::{
pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
};
use crate::persistence::NodePersistence;
/// Represents the in-memory description of a Node.
///
@@ -113,8 +111,8 @@ impl Node {
use NodeAvailability::*;
match (self.availability, availability) {
(Offline, Active(_)) => ToActive,
(Active(_), Offline) => ToOffline,
(Offline, Active) => ToActive,
(Active, Offline) => ToOffline,
_ => Unchanged,
}
}
@@ -125,21 +123,21 @@ impl Node {
// a reference to the original Node's cancellation status. Checking both of these results
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
// when we cloned it, or if the original Node instance's cancellation token was fired.
matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
}
/// Is this node eligible to have work scheduled onto it?
pub(crate) fn may_schedule(&self) -> MaySchedule {
let score = match self.availability {
NodeAvailability::Active(score) => score,
NodeAvailability::Offline => return MaySchedule::No,
};
pub(crate) fn may_schedule(&self) -> bool {
match self.availability {
NodeAvailability::Active => {}
NodeAvailability::Offline => return false,
}
match self.scheduling {
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
NodeSchedulingPolicy::Draining => MaySchedule::No,
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
NodeSchedulingPolicy::Pause => MaySchedule::No,
NodeSchedulingPolicy::Active => true,
NodeSchedulingPolicy::Draining => false,
NodeSchedulingPolicy::Filling => true,
NodeSchedulingPolicy::Pause => false,
}
}
@@ -157,7 +155,8 @@ impl Node {
listen_pg_addr,
listen_pg_port,
scheduling: NodeSchedulingPolicy::Filling,
availability: NodeAvailability::Offline,
// TODO: we shouldn't really call this Active until we've heartbeated it.
availability: NodeAvailability::Active,
cancel: CancellationToken::new(),
}
}
@@ -204,7 +203,7 @@ impl Node {
cancel: &CancellationToken,
) -> Option<mgmt_api::Result<T>>
where
O: FnMut(PageserverClient) -> F,
O: FnMut(mgmt_api::Client) -> F,
F: std::future::Future<Output = mgmt_api::Result<T>>,
{
fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -226,12 +225,8 @@ impl Node {
.build()
.expect("Failed to construct HTTP client");
let client = PageserverClient::from_client(
self.get_id(),
http_client,
self.base_url(),
jwt.as_deref(),
);
let client =
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
let node_cancel_fut = self.cancel.cancelled();
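`with_client_retries` above retries a request-producing closure a bounded number of times while honouring a CancellationToken. A generic sketch of that shape, simplified to a fixed pause and no fatal-error classification (retry_with_cancel is a hypothetical name, not the actual API):

    use std::time::Duration;
    use tokio_util::sync::CancellationToken;

    // Returns None on cancellation, Some(last result) otherwise.
    async fn retry_with_cancel<T, E, F, Fut>(
        mut op: F,
        attempts: u32,
        pause: Duration,
        cancel: &CancellationToken,
    ) -> Option<Result<T, E>>
    where
        F: FnMut() -> Fut,
        Fut: std::future::Future<Output = Result<T, E>>,
    {
        let mut last = None;
        for _ in 0..attempts {
            match op().await {
                Ok(v) => return Some(Ok(v)),
                Err(e) => last = Some(Err(e)),
            }
            tokio::select! {
                _ = tokio::time::sleep(pause) => {}
                _ = cancel.cancelled() => return None,
            }
        }
        last
    }

    #[tokio::main]
    async fn main() {
        let cancel = CancellationToken::new();
        let mut tries = 0u32;
        let res = retry_with_cancel(
            || {
                tries += 1;
                let t = tries;
                async move { if t >= 3 { Ok(t) } else { Err("transient") } }
            },
            5,
            Duration::from_millis(1),
            &cancel,
        )
        .await;
        assert_eq!(res, Some(Ok(3)));
    }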

View File

@@ -1,203 +0,0 @@
use pageserver_api::{
models::{
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
};
use pageserver_client::mgmt_api::{Client, Result};
use reqwest::StatusCode;
use utils::id::{NodeId, TimelineId};
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
/// controller to collect metrics in a non-intrusive manner.
#[derive(Debug, Clone)]
pub(crate) struct PageserverClient {
inner: Client,
node_id_label: String,
}
macro_rules! measured_request {
($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
let labels = crate::metrics::PageserverRequestLabelGroup {
pageserver_id: $node_id,
path: $name,
method: $method,
};
let latency = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_pageserver_request_latency;
let _timer_guard = latency.start_timer(labels.clone());
let res = $invoke;
if res.is_err() {
let error_counters = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_pageserver_request_error;
error_counters.inc(labels)
}
res
}};
}
impl PageserverClient {
pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
Self {
inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
node_id_label: node_id.0.to_string(),
}
}
pub(crate) fn from_client(
node_id: NodeId,
raw_client: reqwest::Client,
mgmt_api_endpoint: String,
jwt: Option<&str>,
) -> Self {
Self {
inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
node_id_label: node_id.0.to_string(),
}
}
pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
measured_request!(
"tenant",
crate::metrics::Method::Delete,
&self.node_id_label,
self.inner.tenant_delete(tenant_shard_id).await
)
}
pub(crate) async fn tenant_time_travel_remote_storage(
&self,
tenant_shard_id: TenantShardId,
timestamp: &str,
done_if_after: &str,
) -> Result<()> {
measured_request!(
"tenant_time_travel_remote_storage",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner
.tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
.await
)
}
pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantShardId,
wait: Option<std::time::Duration>,
) -> Result<(StatusCode, SecondaryProgress)> {
measured_request!(
"tenant_secondary_download",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner.tenant_secondary_download(tenant_id, wait).await
)
}
pub(crate) async fn location_config(
&self,
tenant_shard_id: TenantShardId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
lazy: bool,
) -> Result<()> {
measured_request!(
"location_config",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner
.location_config(tenant_shard_id, config, flush_ms, lazy)
.await
)
}
pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
measured_request!(
"location_configs",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.list_location_config().await
)
}
pub(crate) async fn get_location_config(
&self,
tenant_shard_id: TenantShardId,
) -> Result<Option<LocationConfig>> {
measured_request!(
"location_config",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.get_location_config(tenant_shard_id).await
)
}
pub(crate) async fn timeline_create(
&self,
tenant_shard_id: TenantShardId,
req: &TimelineCreateRequest,
) -> Result<TimelineInfo> {
measured_request!(
"timeline",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner.timeline_create(tenant_shard_id, req).await
)
}
pub(crate) async fn timeline_delete(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<StatusCode> {
measured_request!(
"timeline",
crate::metrics::Method::Delete,
&self.node_id_label,
self.inner
.timeline_delete(tenant_shard_id, timeline_id)
.await
)
}
pub(crate) async fn tenant_shard_split(
&self,
tenant_shard_id: TenantShardId,
req: TenantShardSplitRequest,
) -> Result<TenantShardSplitResponse> {
measured_request!(
"tenant_shard_split",
crate::metrics::Method::Put,
&self.node_id_label,
self.inner.tenant_shard_split(tenant_shard_id, req).await
)
}
pub(crate) async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
) -> Result<Vec<TimelineInfo>> {
measured_request!(
"timelines",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.timeline_list(tenant_shard_id).await
)
}
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
measured_request!(
"utilization",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.get_utilization().await
)
}
}
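The deleted `measured_request!` macro is a plain `macro_rules!` wrapper: time the expression, bump an error counter on failure, hand back the result. A stripped-down, runnable version of the same trick, printing where the real macro feeds the metrics registry:

    use std::time::Instant;

    macro_rules! measured {
        ($name:literal, $invoke:expr) => {{
            let started = Instant::now();
            let res = $invoke;
            if res.is_err() {
                eprintln!("{}: error", $name);
            }
            println!("{}: {:?}", $name, started.elapsed());
            res
        }};
    }

    fn flaky(ok: bool) -> Result<u32, &'static str> {
        if ok { Ok(1) } else { Err("boom") }
    }

    fn main() {
        let _ = measured!("flaky_ok", flaky(true));
        let _ = measured!("flaky_err", flaky(false));
    }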

View File

@@ -19,9 +19,6 @@ use serde::{Deserialize, Serialize};
use utils::generation::Generation;
use utils::id::{NodeId, TenantId};
use crate::metrics::{
DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
};
use crate::node::Node;
/// ## What do we store?
@@ -78,25 +75,6 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[derive(measured::FixedCardinalityLabel, Clone)]
pub(crate) enum DatabaseOperation {
InsertNode,
UpdateNode,
DeleteNode,
ListNodes,
BeginShardSplit,
CompleteShardSplit,
AbortShardSplit,
Detach,
ReAttach,
IncrementGeneration,
ListTenantShards,
InsertTenantShards,
UpdateTenantShard,
DeleteTenant,
UpdateTenantConfig,
}
#[must_use]
pub(crate) enum AbortShardSplitStatus {
/// We aborted the split in the database by reverting to the parent shards
@@ -137,34 +115,6 @@ impl Persistence {
}
}
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
operation: op.clone(),
});
let res = self.with_conn(func).await;
if let Err(err) = &res {
let error_counter = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_error;
error_counter.inc(DatabaseQueryErrorLabelGroup {
error_type: err.error_label(),
operation: op,
})
}
res
}
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
@@ -180,27 +130,21 @@ impl Persistence {
/// When a node is first registered, persist it before using it for anything
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
let np = node.to_persistent();
self.with_measured_conn(
DatabaseOperation::InsertNode,
move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
},
)
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
})
.await
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
let nodes: Vec<NodePersistence> = self
.with_measured_conn(
DatabaseOperation::ListNodes,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
},
)
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
})
.await?;
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -215,7 +159,7 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
.with_conn(move |conn| {
let updated = diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -237,12 +181,9 @@ impl Persistence {
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_measured_conn(
DatabaseOperation::ListTenantShards,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
},
)
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
})
.await?;
if loaded.is_empty() {
@@ -270,10 +211,15 @@ impl Persistence {
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for shard in decoded.tenants.values_mut() {
if shard.placement_policy == "\"Single\"" {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string();
for (tenant_id, tenant) in &mut decoded.tenants {
// Backward compat: an old attachments.json from before PR #6251, replace
// empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = tenant_id.to_string();
tenant.config = serde_json::to_string(&TenantConfig::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
}
}
@@ -319,20 +265,17 @@ impl Persistence {
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
})?;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
},
)
})?;
Ok(())
})
.await
}
@@ -340,31 +283,25 @@ impl Persistence {
/// the tenant from memory on this server.
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::DeleteTenant,
move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
Ok(())
},
)
Ok(())
})
.await
}
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
self.with_measured_conn(
DatabaseOperation::DeleteNode,
move |conn| -> DatabaseResult<()> {
diesel::delete(nodes)
.filter(node_id.eq(del_node_id.0 as i64))
.execute(conn)?;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(nodes)
.filter(node_id.eq(del_node_id.0 as i64))
.execute(conn)?;
Ok(())
},
)
Ok(())
})
.await
}
@@ -378,7 +315,7 @@ impl Persistence {
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
.with_conn(move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(node_id.0 as i64))
.set(generation.eq(generation + 1))
@@ -428,7 +365,7 @@ impl Persistence {
) -> anyhow::Result<Generation> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -472,7 +409,7 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -513,7 +450,7 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
@@ -528,7 +465,7 @@ impl Persistence {
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
self.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -558,7 +495,7 @@ impl Persistence {
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
@@ -622,29 +559,26 @@ impl Persistence {
old_shard_count: ShardCount,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::CompleteShardSplit,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
})?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
},
)
})?;
Ok(())
})
.await
}
@@ -656,44 +590,40 @@ impl Persistence {
new_shard_count: ShardCount,
) -> DatabaseResult<AbortShardSplitStatus> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::AbortShardSplit,
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted =
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards.
if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \
// Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards.
if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \
count {new_shard_count:?} on tenant {split_tenant_id}"
)));
}
)));
}
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(aborted)
},
)
Ok(aborted)
})
.await
}
}
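The `with_measured_conn` wrapper that this revert removes is a higher-order function: run the inner operation, record its latency, and count errors per operation. A generic sketch of that wrapper shape, detached from Diesel and the real metrics registry (with_measured is a hypothetical name):

    use std::time::Instant;

    async fn with_measured<T, E, Fut>(op: &'static str, fut: Fut) -> Result<T, E>
    where
        Fut: std::future::Future<Output = Result<T, E>>,
    {
        let started = Instant::now();
        let res = fut.await;
        // In the real code these feed a latency histogram and an error counter.
        println!("{op}: {:?} (err={})", started.elapsed(), res.is_err());
        res
    }

    #[tokio::main]
    async fn main() {
        let ok: Result<u32, &str> = with_measured("list_nodes", async { Ok(3) }).await;
        assert_eq!(ok, Ok(3));
    }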

View File

@@ -1,4 +1,3 @@
use crate::pageserver_client::PageserverClient;
use crate::persistence::Persistence;
use crate::service;
use hyper::StatusCode;
@@ -9,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_client::mgmt_api;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
@@ -118,15 +117,6 @@ impl Reconciler {
flush_ms: Option<Duration>,
lazy: bool,
) -> Result<(), ReconcileError> {
if !node.is_available() && config.mode == LocationConfigMode::Detached {
// Attempts to detach from offline nodes may be satisfied without doing I/O: a node which is offline
// will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
// what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
self.observed.locations.remove(&node.get_id());
return Ok(());
}
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -159,16 +149,9 @@ impl Reconciler {
};
tracing::info!("location_config({node}) complete: {:?}", config);
match config.mode {
LocationConfigMode::Detached => {
self.observed.locations.remove(&node.get_id());
}
_ => {
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
}
}
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
Ok(())
}
@@ -260,11 +243,8 @@ impl Reconciler {
tenant_shard_id: TenantShardId,
node: &Node,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.service_config.jwt_token.as_deref(),
);
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
let timelines = client.timeline_list(&tenant_shard_id).await?;
Ok(timelines
@@ -278,81 +258,22 @@ impl Reconciler {
tenant_shard_id: TenantShardId,
node: &Node,
) -> Result<(), ReconcileError> {
// This is not the timeout for a request, but the total amount of time we're willing to wait
// for a secondary location to get up to date before we proceed with the migration.
const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
// This is the long-polling interval for the secondary download requests we send to the destination pageserver
// during a migration.
const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
let started_at = Instant::now();
loop {
let (status, progress) = match node
.with_client_retries(
|client| async move {
client
.tenant_secondary_download(
tenant_shard_id,
Some(REQUEST_DOWNLOAD_TIMEOUT),
)
.await
},
&self.service_config.jwt_token,
1,
3,
REQUEST_DOWNLOAD_TIMEOUT * 2,
&self.cancel,
)
.await
{
None => Err(ReconcileError::Cancel),
Some(Ok(v)) => Ok(v),
Some(Err(e)) => {
// Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
// attaching, but we should not let an issue with a secondary location stop us proceeding
// with a live migration.
tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
return Ok(());
}
}?;
if status == StatusCode::OK {
tracing::info!(
"Downloads to {} complete: {}/{} layers, {}/{} bytes",
node,
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
return Ok(());
} else if status == StatusCode::ACCEPTED {
let total_runtime = started_at.elapsed();
if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
total_runtime.as_millis(),
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
// Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
// it just makes the I/O performance for users less good.
return Ok(());
}
// Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
// to the pageserver is a long-poll.
tracing::info!(
"Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
node,
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match node
.with_client_retries(
|client| async move { client.tenant_secondary_download(tenant_shard_id).await },
&self.service_config.jwt_token,
1,
1,
Duration::from_secs(60),
&self.cancel,
)
.await
{
None => Err(ReconcileError::Cancel),
Some(Ok(_)) => Ok(()),
Some(Err(e)) => {
tracing::info!(" (skipping destination download: {})", e);
Ok(())
}
}
}
@@ -495,7 +416,7 @@ impl Reconciler {
}
}
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
// this location will be deleted in the general case reconciliation that runs after this.
let origin_secondary_conf = build_location_config(
&self.shard,
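The deleted warm-up loop above long-polls the destination until downloads finish or a total deadline passes, then gives up but proceeds. A compact sketch of that poll-until-deadline control flow, with a stub standing in for tenant_secondary_download:

    use std::time::{Duration, Instant};

    enum Status { Done, InProgress }

    // Stand-in for one long-poll round trip to the destination pageserver.
    async fn poll_once(rounds_left: &mut u32) -> Status {
        if *rounds_left == 0 {
            Status::Done
        } else {
            *rounds_left -= 1;
            Status::InProgress
        }
    }

    #[tokio::main]
    async fn main() {
        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
        let started_at = Instant::now();
        let mut rounds_left = 3u32;
        loop {
            match poll_once(&mut rounds_left).await {
                Status::Done => break,
                // Give up but proceed, as the reconciler does: a cold
                // secondary only costs I/O performance, not correctness.
                Status::InProgress if started_at.elapsed() > TOTAL_DOWNLOAD_TIMEOUT => break,
                // The poll itself long-waits on the server side, so no sleep here.
                Status::InProgress => continue,
            }
        }
    }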

View File

@@ -1,5 +1,4 @@
use crate::{node::Node, tenant_state::TenantState};
use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize;
use std::collections::HashMap;
use utils::{http::error::ApiError, id::NodeId};
@@ -20,34 +19,15 @@ impl From<ScheduleError> for ApiError {
}
#[derive(Serialize, Eq, PartialEq)]
pub enum MaySchedule {
Yes(UtilizationScore),
No,
}
#[derive(Serialize)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
/// Whether this node is currently eligible to have new shards scheduled (this is derived
/// from a node's availability state and scheduling policy).
may_schedule: MaySchedule,
may_schedule: bool,
}
impl PartialEq for SchedulerNode {
fn eq(&self, other: &Self) -> bool {
let may_schedule_matches = matches!(
(&self.may_schedule, &other.may_schedule),
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
);
may_schedule_matches && self.shard_count == other.shard_count
}
}
impl Eq for SchedulerNode {}
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
/// on which to run.
///
@@ -206,15 +186,13 @@ impl Scheduler {
return None;
}
// TODO: When the utilization score returned by the pageserver becomes meaningful,
// schedule based on that instead of the shard count.
let node = nodes
.iter()
.map(|node_id| {
let may_schedule = self
.nodes
.get(node_id)
.map(|n| n.may_schedule != MaySchedule::No)
.map(|n| n.may_schedule)
.unwrap_or(false);
(*node_id, may_schedule)
})
@@ -233,7 +211,7 @@ impl Scheduler {
.nodes
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
if hard_exclude.contains(k) || !v.may_schedule {
None
} else {
Some((*k, v.shard_count))
@@ -252,7 +230,7 @@ impl Scheduler {
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.may_schedule,
node.shard_count
);
}
@@ -277,7 +255,6 @@ impl Scheduler {
pub(crate) mod test_utils {
use crate::node::Node;
use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
use std::collections::HashMap;
use utils::id::NodeId;
/// Test helper: synthesize the requested number of nodes, all in active state.
@@ -287,14 +264,13 @@ pub(crate) mod test_utils {
(1..n + 1)
.map(|i| {
(NodeId(i), {
let mut node = Node::new(
let node = Node::new(
NodeId(i),
format!("httphost-{i}"),
80 + i as u16,
format!("pghost-{i}"),
5432 + i as u16,
);
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
assert!(node.is_available());
node
})

File diff suppressed because it is too large

View File

@@ -4,10 +4,7 @@ use std::{
time::Duration,
};
use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
};
use crate::{metrics, persistence::TenantShardPersistence};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
@@ -460,7 +457,22 @@ impl TenantState {
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Attached(secondary_count) => {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
@@ -721,10 +733,7 @@ impl TenantState {
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_spawn
.inc();
metrics::RECONCILER.spawned.inc();
let result_tx = result_tx.clone();
let join_handle = tokio::task::spawn(
async move {
@@ -742,12 +751,10 @@ impl TenantState {
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_complete
.inc(ReconcileCompleteLabelGroup {
status: ReconcileOutcome::Cancel,
});
metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
.inc();
return;
}
@@ -762,18 +769,18 @@ impl TenantState {
}
// Update result counter
let outcome_label = match &result {
Ok(_) => ReconcileOutcome::Success,
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
Err(_) => ReconcileOutcome::Error,
};
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_complete
.inc(ReconcileCompleteLabelGroup {
status: outcome_label,
});
match &result {
Ok(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
Err(ReconcileError::Cancel) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
Err(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
}
.inc();
result_tx
.send(ReconcileResult {
@@ -888,7 +895,7 @@ pub(crate) mod tests {
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
@@ -936,7 +943,7 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
tenant_state.observed.locations.insert(
NodeId(3),

View File

@@ -294,7 +294,7 @@ where
// is in state 'taken' but the thread that would unlock it is
// not there.
// 2. A rust object that represented some external resource in the
// parent now got implicitly copied by the fork, even though
// parent now got implicitly copied by the the fork, even though
// the object's type is not `Copy`. The parent program may use
// non-copyability as way to enforce unique ownership of an
// external resource in the typesystem. The fork breaks that

View File

@@ -437,7 +437,7 @@ async fn handle_tenant(
let placement_policy = match create_match.get_one::<String>("placement-policy") {
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
_ => PlacementPolicy::Attached(0),
_ => PlacementPolicy::Single,
};
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -523,6 +523,88 @@ async fn handle_tenant(
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured on the pageserver");
}
Some(("migrate", matches)) => {
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_migrate(tenant_shard_id, new_pageserver_id)
.await?;
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
}
Some(("status", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let mut shard_table = comfy_table::Table::new();
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
let mut tenant_synthetic_size = None;
let storage_controller = StorageController::from_env(env);
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
let size = pageserver
.http_client
.tenant_details(shard.shard_id)
.await?
.tenant_info
.current_physical_size
.unwrap();
shard_table.add_row([
format!("{}", shard.shard_id.shard_slug()),
format!("{}", shard.node_id.0),
format!("{} MiB", size / (1024 * 1024)),
]);
if shard.shard_id.is_zero() {
tenant_synthetic_size =
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
}
}
let Some(synthetic_size) = tenant_synthetic_size else {
bail!("Shard 0 not found")
};
let mut tenant_table = comfy_table::Table::new();
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
tenant_table.add_row([
"Synthetic size".to_string(),
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
]);
println!("{tenant_table}");
println!("{shard_table}");
}
Some(("shard-split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
let shard_stripe_size: Option<ShardStripeSize> = matches
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
.cloned()
.unwrap();
let storage_controller = StorageController::from_env(env);
let result = storage_controller
.tenant_split(tenant_id, shard_count, shard_stripe_size)
.await?;
println!(
"Split tenant {} into shards {}",
tenant_id,
result
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
@@ -1496,6 +1578,19 @@ fn cli() -> Command {
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("migrate")
.about("Migrate a tenant from one pageserver to another")
.arg(tenant_id_arg.clone())
.arg(pageserver_id_arg.clone()))
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("shard-split")
.about("Increase the number of shards in the tenant")
.arg(tenant_id_arg.clone())
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
)
)
.subcommand(
Command::new("pageserver")

View File

@@ -12,7 +12,7 @@
//!
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
//! the basebackup from the pageserver to initialize the data directory, and
//! the basebackup from the pageserver to initialize the the data directory, and
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
//! until it exits.
//!

View File

@@ -114,7 +114,7 @@ impl NeonBroker {
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default, deny_unknown_fields)]
#[serde(default)]
pub struct PageServerConf {
// node id
pub id: NodeId,
@@ -126,9 +126,6 @@ pub struct PageServerConf {
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
}
impl Default for PageServerConf {
@@ -139,8 +136,6 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
virtual_file_io_engine: None,
get_vectored_impl: None,
}
}
}

View File

@@ -78,39 +78,18 @@ impl PageServerNode {
///
/// These all end up on the command line of the `pageserver` binary.
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
let id = format!("id={}", self.conf.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display()
);
let PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
} = &self.conf;
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
let id = format!("id={}", id);
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
} else {
String::new()
};
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
format!("get_vectored_impl='{get_vectored_impl}'")
} else {
String::new()
};
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -122,8 +101,6 @@ impl PageServerNode {
listen_http_addr_param,
listen_pg_addr_param,
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -134,7 +111,7 @@ impl PageServerNode {
// Storage controller uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(http_auth_type, AuthType::NeonJWT) {
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -152,7 +129,8 @@ impl PageServerNode {
));
}
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
{
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
@@ -576,6 +554,13 @@ impl PageServerNode {
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
}
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
Ok(self
.http_client
.tenant_secondary_download(*tenant_id)
.await?)
}
pub async fn timeline_create(
&self,
tenant_shard_id: TenantShardId,

View File

@@ -38,9 +38,6 @@ const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -272,18 +269,13 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--dev",
"--database-url",
&database_url,
"--max-unavailable-interval",
&max_unavailable.to_string(),
]
.into_iter()
.map(|s| s.to_string())
@@ -476,7 +468,7 @@ impl StorageController {
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(
Method::GET,
format!("debug/v1/tenant/{tenant_id}/locate"),
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await

View File

@@ -1,408 +0,0 @@
# Sharding Phase 1: Static Key-space Sharding
## Summary
To enable databases with sizes approaching the capacity of a pageserver's disk,
it is necessary to break up the storage for the database, or _shard_ it.
Sharding in general is a complex area. This RFC aims to define an initial
capability that will permit creating large-capacity databases using a static configuration
defined at time of Tenant creation.
## Motivation
Currently, all data for a Tenant, including all its timelines, is stored on a single
pageserver. The local storage required may be several times larger than the actual
database size, due to LSM write inflation.
If a database is larger than what one pageserver can hold, then it becomes impossible
for the pageserver to hold it in local storage, as it must do to provide service to
clients.
### Prior art
In Neon:
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
Prior art in other distributed systems is too broad to capture here: pretty much
any scale out storage system does something like this.
## Requirements
- Enable creating a large (for example, 16TiB) database without requiring dedicated
pageserver nodes.
- Share read/write bandwidth costs for large databases across pageservers, as well
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
that disrupt service to other tenants.
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
does not write out a single contiguous range of page numbers.
_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
that a user might create on a current-gen enterprise SSD should also work well on
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
pageserver backend is not the limiting factor in the database size_.
## Non Goals
- Independently distributing timelines within the same tenant. If a tenant has many
timelines, then sharding may be a less efficient mechanism for distributing load than
sharing out timelines between pageservers.
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
based on the idea that separate mechanisms will make sense for each dimension.
## Impacted Components
pageserver, control plane, postgres/smgr
## Terminology
**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
the page number is the key in that store. `Key` is a literal data type in existing code.
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
of keys and LSNs as a two dimensional space.
## Implementation
### Key sharding vs. LSN sharding
When we think of sharding across the two dimensional key/lsn space, this is an
opportunity to think about how the two dimensions differ:
- Sharding the key space distributes the _write_ workload of ingesting data
and compacting. This work must be carefully managed so that exactly one
node owns a given key.
- Sharding the LSN space distributes the _historical read_ workload. This work
can be done by anyone without any special coordination, as long as they can
see the remote index and layers.
The key sharding is the harder part, and also the more urgent one, to support larger
capacity databases. Because distributing historical LSN read work is a relatively
simpler problem that most users don't have, we defer it to future work. It is anticipated
that some quite simple P2P offload model will enable distributing work for historical
reads: a node which is low on space can call out to a peer to ask it to download and
serve reads from a historical layer.
### Key mapping scheme
Having decided to focus on key sharding, we must next decide how we will map
keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
between data locality and avoiding entire large relations mapping to the same shard.
We will define two spaces:
- Key space: unsigned integer
- Shard space: integer from 0 to N-1, where we have N shards.
### Key -> Shard mapping
Keys are currently defined in the pageserver's getpage@lsn interface as follows:
```
pub struct Key {
pub field1: u8,
pub field2: u32,
pub field3: u32,
pub field4: u32,
pub field5: u8,
pub field6: u32,
}
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: blknum,
}
}
```
_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
shards. For distribution purposes, we only care about user data keys_
The properties we want from our Key->Shard mapping are:
- Locality in `blknum`, such that adjacent `blknum` will usually map to
the same stripe and consequently land on the same shard, even though the overall
collection of blocks in a relation will be spread over many stripes and therefore
many shards.
- Avoid the same blknum on different relations landing on the same stripe, so that
with many small relations we do not end up aliasing data to the same stripe/shard.
- Avoid vulnerability to aliasing in the values of relation identity fields, such that
if there are patterns in the value of `relnode`, these do not manifest as patterns
in data placement.
To accomplish this, the blknum is used to select a stripe, and stripes are
assigned to shards in a pseudorandom order via a hash. The motivation for
pseudo-random distribution (rather than sequential mapping of stripe to shard)
is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
all relations' stripes to touch pageservers in the same order.
To map a `Key` to a shard:
- Hash the `Key` field 4 (relNode).
- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
hash of this with the hash from the previous step.
- The total hash modulo the shard count gives the shard holding this key.
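A minimal sketch of this mapping, assuming a simple combined hash (the hash function and signature here are illustrative assumptions, not the production implementation):

```
fn key_to_shard(rel_node: u32, blknum: u32, stripe_size: u32, shard_count: u32) -> u32 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    // Step 1: hash field 4 (relNode).
    rel_node.hash(&mut hasher);
    // Step 2: divide field 6 (blknum) by the stripe size and fold it in.
    (blknum / stripe_size).hash(&mut hasher);
    // Step 3: the total hash modulo the shard count selects the shard.
    (hasher.finish() % shard_count as u64) as u32
}
```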
Why don't we use the other fields in the Key?
- We ignore `forknum` for key mapping, because it distinguishes different classes of data
in the same relation, and we would like to keep the data in a relation together.
- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
database's blocks differ only by spcNode and dbNode from the original. To enable running
this type of creation without cross-pageserver communication, we must ensure that these
blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
### Data placement examples
For example, consider two extreme cases of postgres data layout for large databases,
in a system with 8 shards and a stripe size of 32k pages:
- A single large relation: `blknum` division will break the data up into 4096
stripes, which will be scattered across the shards.
- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
and that stripe will be placed according to the hash of key field 4. The
data placement will be statistically uniform across shards.
Data placement will be more uneven on smaller databases:
- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
that both relations land on the same shard and no data lands on the other shard.
- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
the data of the other four shards.
These uneven cases for small amounts of data do not matter, as long as the stripe size
is an order of magnitude smaller than the amount of data we are comfortable holding
in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
a tenant has some shards with 256MB size and some shards with 512MB size, even though
the standard deviation of shard size within the tenant is very high. Our key mapping
scheme provides a statistical guarantee that as the tenant's overall data size increases,
uniformity of placement will improve.
### Important Types
#### `ShardIdentity`
Provides the information needed to know whether a particular key belongs
to a particular shard:
- Layout version
- Stripe size
- Shard count
- Shard index
This structure's size is constant. Note that if we had used a different key
mapping scheme such as consistent hashing with explicit hash ranges assigned
to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
key mapping scheme used here enables a small fixed size ShardIdentity.
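A sketch of what such a constant-size struct might look like (the field types are assumptions for illustration):

```
struct ShardIdentity {
    layout_version: u8, // version of the key mapping scheme
    stripe_size: u32,   // stripe width, in pages
    shard_count: u8,    // total number of shards in the tenant
    shard_number: u8,   // this shard's index, 0..shard_count
}
```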
### Pageserver changes
#### Structural
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
covers the whole keyspace.
When the pageserver writes layers and index_part.json to remote storage, it must
include the shard index & count in the name, to avoid collisions (the count is
necessary for future-proofing: the count will vary in time). These keys
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
exactly the same for TenantShards as it does for Tenants today: each shard will have
its own generation number.
#### Storage Format: Keys
For tenants with >1 shard, layer files implicitly become sparse: within the key
range described in the layer name, the layer file for a shard will only hold the
content relevant to stripes assigned to the shard.
For this reason, the LayerFileName within a tenant is no longer unique: different shards
may use the same LayerFileName to refer to different data. We may solve this simply
by including the shard number in the keys used for layers.
The shard number will be included as a prefix (as part of tenant ID), like this:
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
Reasons for this particular format:
- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
we construct a layer file name), and enables efficient listing of index_parts within
a particular shard-timeline prefix.
- Including the shard _count_ as well as shard number means that in future when we implement
shard splitting, it will be possible for a parent shard and one of its children to write
the same layer file without a name collision. For example, a parent shard 0_1 might split
into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
that is distinct from what shard 0_1 would have written at the same place.
In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
for example a single-shard tenant's prefix will be `0001`.
For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
and use this as a cue to construct paths with no prefix at all.
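A sketch of rendering that shard part of the path, including the assumed `shard_count == 0` legacy convention (names are illustrative):

```
fn shard_suffix(shard_number: u8, shard_count: u8) -> String {
    if shard_count == 0 {
        // Special legacy ShardIdentity: construct paths with no suffix at all.
        String::new()
    } else {
        // Fixed-length hex, e.g. a single-shard tenant renders as "-0001".
        format!("-{:02x}{:02x}", shard_number, shard_count)
    }
}
```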
#### Storage Format: Indices
In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
when we implement shard splitting in future, it will be useful to enable shards to reference layers
written by other shards (specifically the parent shard during a split), so that shards don't
have to exhaustively copy all data into their own shard-prefixed keys.
To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
tuple on each layer, such that it can construct paths for layers written by other shards. This
naturally raises the question of who "owns" such layers written by ancestral shards: this problem
will be addressed in phase 2.
For backward compatibility, any index entry without shard information will be assumed to be
in the legacy shardidentity.
#### WAL Ingest
In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
it down to the pages relevant to their shard:
- For ordinary user data writes, only retain a write if it matches the ShardIdentity
- For metadata describing relations etc, all shards retain these writes.
The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
expensive: if the safekeeper can be made shard-aware then it could be taught to use
the min() of all shards' remote_consistent_lsns to decide when it is safe to trim the WAL
(trimming at the max would discard WAL still needed by lagging shards).
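As a rough illustration of the filtering rule above, reusing the hypothetical `key_to_shard` sketch from earlier (`is_metadata` stands in for however a WAL record is classified):

```
fn shard_keeps_record(
    rel_node: u32,
    blknum: u32,
    is_metadata: bool, // relation metadata is mirrored to all shards
    stripe_size: u32,
    shard_count: u32,
    my_shard: u32,
) -> bool {
    is_metadata || key_to_shard(rel_node, blknum, stripe_size, shard_count) == my_shard
}
```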
#### Compaction/GC
No changes needed.
The pageserver doesn't have to do anything special during compaction
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
This will result in sparse layer files, containing keys only in the stripes that this
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
the key range, these should be updated to ignore gaps that are due to sharding, to
avoid spuriously splitting up layers into stripe-sized pieces.
### Compute Endpoints
Compute endpoints will need to:
- Accept a vector of connection strings as part of their configuration from the control plane
- Route pageserver requests according to mapping the hash of key to the correct
entry in the vector of connection strings.
Doing this in compute rather than routing requests via a single pageserver is
necessary to enable sharding tenants without adding latency from extra hops.
### Control Plane
Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
tenants.
Tenant lifecycle operations like deletion will require fanning-out to all the shards
in the tenant. The same goes for timeline creation and deletion: a timeline should
not be considered created until it has been created in all shards.
#### Selectively enabling sharding for large tenants
Initially, we will explicitly enable sharding for large tenants only.
In future, this hint mechanism will become optional when we implement automatic
re-sharding of tenants.
## Future Phases
This section exists to indicate what will likely come next after this phase.
Phases 2a and 2b are amenable to execution in parallel.
### Phase 2a: WAL fan-out
**Problem**: when all shards consume the whole WAL, the network bandwidth used
for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
of the shard count.
Network bandwidth is not our most pressing bottleneck, but it is likely to become
a problem if we set a modest shard count (~8) on a significant number of tenants,
especially as those larger tenants which we shard are also likely to have higher
write bandwidth than average.
### Phase 2b: Shard Splitting
**Problem**: the number of shards in a tenant is defined at creation time and cannot
be changed. This causes excessive sharding for most small tenants, and an upper
bound on scale for very large tenants.
To address this, a _splitting_ feature will later be added. One shard can split its
data into a number of children by doing a special compaction operation to generate
image layers broken up child-shard-wise, and then writing out an `index_part.json` for
each child. This will then require external coordination (by the control plane) to
safely attach these new child shards and then move them around to distribute work.
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
the risk/complexity of implementing such a rarely-encountered scenario.
### Phase N (future): distributed historical reads
**Problem**: while sharding based on key is good for handling changes in overall
database size, it is less suitable for spiky/unpredictable changes in the read
workload to historical layers. Sudden increases in historical reads could result
in sudden increases in local disk capacity required for a TenantShard.
Example: the extreme case of this would be to run a tenant for a year, then create branches
with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
the on-disk capacity footprint of a TenantShard, since it would be serving reads
from all those disparate historical layers.
If we can respond fast enough, then key-sharding a tenant more finely can help with
this, but splitting may be a relatively expensive operation and the increased historical
read load may be transient.
A separate mechanism for handling heavy historical reads could be something like
a gossip mechanism for pageservers to communicate
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
ask another to go read the necessary layers from remote storage to serve the read. This
requires relatively little coordination because it is read-only: any node can service any
read. All reads to a particular shard would still flow through one node, but the
disk capacity & I/O impact of servicing the read would be distributed.
## FAQ/Alternatives
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
When a database is growing under a write workload, writes may predominantly hit the
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
is intensively re-writing a particular relation, if that relation lived in a particular
shard then it would not achieve our goal of distributing the write work across shards.
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
database would still cause a load hotspot on the pageserver routing its read requests.
2. The additional hop through the "proxy" pageserver would add latency and overall
resource cost (CPU, network bandwidth)
### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
In this model, there would be no explicit sharding of work, but the pageserver to which
a tenant is attached would not hold all layers on its disk: instead, it would call out
to peers to have them store some layers, and call out to those peers to request reads
in those layers.
This mechanism will work well for distributing work in the LSN dimension, but in the key
space dimension it has the major limitation of requiring one node to handle all
incoming writes, and compactions. Even if the write workload for a large database
fits in one pageserver, it will still be a hotspot and such tenants may still
de-facto require their own pageserver.

View File

@@ -1,479 +0,0 @@
# Shard splitting
## Summary
This RFC describes a new pageserver API for splitting an existing tenant shard into
multiple shards, and describes how to use this API to safely increase the total
shard count of a tenant.
## Motivation
In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
tenants beyond the capacity of a single pageserver by breaking up the key space
into stripes, and distributing these stripes across many pageservers. However,
the shard count was defined once at tenant creation time and not varied thereafter.
In practice, the expected size of a database is rarely known at creation time, and
it is inefficient to enable sharding for very small tenants: we need to be
able to create a tenant with a small number of shards (such as 1), and later expand
when it becomes clear that the tenant has grown in size to a point where sharding
is beneficial.
### Prior art
Many distributed systems have the problem of choosing how many shards to create for
tenants that do not specify an expected size up-front. There are a couple of general
approaches:
- Write to a key space in order, and start a new shard when the highest key advances
past some point. This doesn't work well for Neon, because we write to our key space
in many different contiguous ranges (per relation), rather than in one contiguous
range. To adapt to this kind of model, we would need a sharding scheme where each
relation had its own range of shards, which would be inefficient for the common
case of databases with many small relations.
- Monitor the system, and automatically re-shard at some size threshold. For
example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
component monitors the size of each RADOS Pool, and adjusts the number of Placement
Groups (Ceph's shard equivalent).
## Requirements
- A configurable capacity limit per-shard is enforced.
- Changes in shard count do not interrupt service beyond requiring postgres
to reconnect (i.e. milliseconds).
- A human does not have to choose the shard count
## Non Goals
- Shard splitting is always a tenant-global operation: we will not enable splitting
one shard while leaving others intact.
- The inverse operation (shard merging) is not described in this RFC. This is a lower
priority than splitting, because databases grow more often than they shrink, and
a database with many shards will still work properly if the stored data shrinks, just
with slightly more overhead (e.g. redundant WAL replication)
- Shard splitting is only initiated based on capacity bounds, not load. Splitting
a tenant based on load will make sense for some medium-capacity, high-load workloads,
but is more complex to reason about and likely is not desirable until we have
shard merging to reduce the shard count again if the database becomes less busy.
## Impacted Components
pageserver, storage controller
(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
## Terminology
**Parent** shards are the shards that exist before a split. **Child** shards are
the new shards created during a split.
**Shard** is synonymous with _tenant shard_.
**Shard Index** is the 2-tuple of shard number and shard count, written in
paths as {:02x}{:02x}, e.g. `0001`.
## Background
In the implementation section, a couple of existing aspects of sharding are important
to remember:
- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
storage paths, and remote index metadata.
- Remote layer file paths contain the shard index of the shard that created them, and
remote indices contain the same index to enable building the layer file path. A shard's
index may reference layers that were created by another shard.
- Local tenant shard directories include the shard index. All layers downloaded by
a tenant shard are stored in this shard-prefixed path, even if those layers were
initially created by another shard: tenant shards do not read and write one anothers'
paths.
- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
This is for historical reasons and will be cleaned up in future, but the existing
name is used here to help comprehension when reading code.
## Implementation
Note: this section focuses on the correctness of the core split process. This will
be fairly inefficient in a naive implementation, and several important optimizations
are described in a later section.
There are broadly two parts to the implementation:
1. The pageserver split API, which splits one shard on one pageserver
2. The overall tenant split process, which is coordinated by the storage controller
and calls into the pageserver split API as needed.
### Pageserver Split API
The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
that takes the new total shard count in the body.
The pageserver split API operates on one tenant shard, on one pageserver. External
coordination is required to use it safely, this is described in the later
'Split procedure' section.
#### Preparation
First identify the shard indices for the new child shards. These are deterministic,
calculated from the parent shard's index, and the number of children being created (this
is an input to the API, and validated to be a power of two). In a trivial example, splitting
0001 in two always results in 0002 and 0102.
Child shard indices are chosen such that the children's parts of the keyspace will
be subsets of the parent's parts of the keyspace.
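A sketch of that deterministic calculation. It assumes the stripe-to-shard mapping is `hash % shard_count`, so a stripe owned by parent shard `n` of `c` can only map to child shards `n, n + c, n + 2c, ...` under the new count:

```
fn child_shard_indices(parent_number: u8, parent_count: u8, new_count: u8) -> Vec<(u8, u8)> {
    // The split factor is validated to be a power of two by the API.
    let factor = new_count / parent_count;
    assert!(new_count % parent_count == 0 && factor.is_power_of_two());
    (0..factor)
        .map(|i| (parent_number + i * parent_count, new_count))
        .collect()
}
```

Splitting `0001` (shard 0 of 1) in two yields `(0, 2)` and `(1, 2)`, i.e. `0002` and `0102`, matching the example above.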
#### Step 1: write new remote indices
In remote storage, splitting is very simple: we may just write new index_part.json
objects for each child shard, containing exactly the same layers as the parent shard.
The children will have more data than they need, but this avoids any exhaustive
re-writing or copying of layer files.
The index key path includes a generation number: the parent shard's current
attached generation number will also be used for the child shards' indices. This
makes the operation safely retryable: if everything crashes and restarts, we may
call the split API again on the parent shard, and the result will be some new remote
indices for the child shards, under a higher generation number.
#### Step 2: start new `Tenant` objects
A new `Tenant` object may be instantiated for each child shard, while the parent
shard still exists. When calling the tenant_spawn function for this object,
the remote index from step 1 will be read, and the child shard will start
to ingest WAL to catch up from whatever was in the remote storage at step 1.
We now wait for child shards' WAL ingestion to catch up with the parent shard,
so that we can safely tear down the parent shard without risking an availability
gap to clients reading recent LSNs.
#### Step 3: tear down parent `Tenant` object
Once child shards are running and have caught up with WAL ingest, we no longer
need the parent shard. Note that clients may still be using it -- when we
shut it down, any page_service handlers will also shut down, causing clients
to disconnect. When the client reconnects, it will re-lookup the tenant,
and hit the child shard instead of the parent (shard lookup from page_service
should bias toward higher ShardCount shards).
Note that at this stage the page service client has not yet been notified of
any split. In the trivial single split example:
- Shard 0001 is gone: Tenant object torn down
- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
- Clients will continue to connect to that server thinking that shard 0001 is there,
and all requests will work, because any key that was in shard 0001 is definitely
available in either shard 0002 or shard 0102.
- Eventually, the storage controller (not the pageserver) will decide to migrate
some child shards away: at that point it will do a live migration, ensuring
that the client has an updated configuration before it detaches anything
from the original server.
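A sketch of the lookup bias mentioned above, where `(shard_number, shard_count)` pairs represent the shards resident for a tenant on one pageserver (the types are illustrative):

```
fn resolve_shard(resident: &[(u8, u8)]) -> Option<(u8, u8)> {
    // During a split, parent 0001 and children 0002/0102 may briefly
    // coexist on one pageserver; preferring the highest ShardCount
    // routes reconnecting clients to the child shards.
    resident.iter().copied().max_by_key(|&(_, count)| count)
}
```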
#### Complete
When we send a 200 response to the split request, we are promising the caller:
- That the child shards are persistent in remote storage
- That the parent shard has been shut down
This enables the caller to proceed with the overall shard split operation, which
may involve other shards on other pageservers.
### Storage Controller Split procedure
Splitting a tenant requires calling the pageserver split API, and tracking
enough state to ensure recovery + completion in the event of any component (pageserver
or storage controller) crashing (or request timing out) during the split.
1. call the split API on all existing shards. Ensure that the resulting
child shards are pinned to their pageservers until _all_ the split calls are done.
This pinning may be implemented as a "split bit" on the tenant shards, that
blocks any migrations, and also acts as a sign that if we restart, we must go
through some recovery steps to resume the split.
2. Once all the split calls are done, we may unpin the child shards (clear
the split bit). The split is now complete: subsequent steps are just migrations,
not strictly part of the split.
3. Try to schedule new pageserver locations for the child shards, using
a soft anti-affinity constraint to place shards from the same tenant onto different
pageservers.
Updating computes about the new shard count is not necessary until we migrate
any of the child shards away from the parent's location.
### Recovering from failures
#### Rolling back an incomplete split
An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
and detaching child shards. This will lose any WAL ingested into the children after the parents
were detached earlier, but the parents will catch up.
No special pageserver API is needed for this. From the storage controller's point of view, the
procedure is:
1. For all parent shards in the tenant, ensure they are attached
2. For all child shards, ensure they are not attached
3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
Any remote storage content for child shards is left behind. This is similar to other cases where
we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
index that references it). Future online scrub/cleanup functionality can remove these objects, or
they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
which would include any child shards that were rolled back.
If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
this, we will **block timeline creation during splitting**, so that we can safely roll back until
the split is complete, without risking losing timelines.
Rolling back an incomplete split will happen automatically if a split fails due to some fatal
reason, and will not be accessible via an API:
- A pageserver fails to complete its split API request after too many retries
- A pageserver returns a fatal unexpected error such as 400 or 500
- The storage controller database returns a non-retryable error
- Some internal invariant is violated in the storage controller split code
#### Rolling back a complete split
A complete shard split may be rolled back similarly to an incomplete split, with the following
modifications:
- The parent shards will no longer exist in the storage controller database, so these must
be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
shards in the storage controller database.
- Any timelines that were created after the split complete will disappear when rolling back
to the tenant shards. For this reason, rolling back after a complete split should only
be done due to serious issues where loss of recently created timelines is acceptable, or
in cases where we have confirmed that no timelines were created in the intervening period.
- Parent shards' layers must not have been deleted: this property will come "for free" when
we first roll out sharding, by simply not implementing deletion of parent layers after
a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
Optimizations section), it should apply a TTL to layers such that we have a
defined walltime window in which rollback will be possible.
The storage controller will expose an API for rolling back a complete split, for use
in the field if we encounter some critical bug with a post-split tenant.
#### Retrying API calls during Pageserver Restart
When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
child shards from an ongoing split. This does not intrinsically break anything, and the
pageserver may include all these shards in its `/re-attach` request to the storage controller.
In order to support such restarts, it is important that the storage controller stores
persistent records of each child shard before it calls into a pageserver, as these child shards
may require generation increments via a `/re-attach` request.
The pageserver restart will also result in a failed API call from the storage controller's point
of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
complete, and all shards must remain pinned to their current pageserver locations until the
split is done.
The pageserver API calls during splitting will retry on transient errors, so that
short availability gaps do not result in a failure of the overall operation. The
split in progress will be automatically rolled back if the threshold for API
retries is reached (e.g. if a pageserver stays offline for longer than a typical
restart).
#### Rollback on Storage Controller Restart
On startup, the storage controller will inspect the split bit for tenant shards that
it loads from the database. If any splits are in progress:
- Database content will be reverted to the parent shards
- Child shards will be dropped from memory
- The parent and child shards will be included in the general startup reconciliation that
the storage controller does: any child shards will be detached from pageservers because
they don't exist in the storage controller's expected set of shards, and parent shards
will be attached if they aren't already.
#### Storage controller API request failures/retries
The split request handler will implement idempotency: if the [`Tenant`] requested to split
doesn't exist, we will check for the would-be child shards, and if they already exist,
we consider the request complete.
If a request is retried while the original request is still underway, then the split
request handler will notice an InProgress marker in TenantManager, and return 503
to encourage the client to backoff/retry. This is the same as the general pageserver
API handling for calls that try to act on an InProgress shard.
#### Compute start/restart during a split
If a compute starts up during split, it will be configured with the old sharding
configuration. This will work for reads irrespective of the progress of the split
as long as no child shards have been migrated away from their original location, and
this is guaranteed in the split procedure (see earlier section).
#### Pageserver fails permanently during a split
If a pageserver permanently fails (i.e. the storage controller availability state for it
goes to Offline) while a split is in progress, the splitting operation will roll back, and
during the roll back it will skip any API calls to the offline pageserver. If the offline
pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
### Handling secondary locations
For correctness, it is not necessary to split secondary locations. We can simply detach
the secondary locations for parent shards, and then attach new secondary locations
for child shards.
Clearly this is not optimal, as it will result in re-downloads of layer files that
were already present on disk. See "Splitting secondary locations"
### Conditions to trigger a split
The pageserver will expose a new API for reporting on shards that are candidates
for split: this will return a top-N report of the largest tenant shards by
physical size (remote size). This should exclude any tenants that are already
at the maximum configured shard count.
The API would look something like:
`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
A split operation will be started when the tenant exceeds some threshold. This threshold
should be _less than_ how large we actually want shards to be, perhaps much less. That's to
minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
tenant size distribution may be useful here: if we can make a statement like "usually, if
a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
make our policy to split a tenant at 20GiB.
The finest split we can do is by factors of two, but we can do higher-cardinality splits
too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
as it grows. An example of a very simple heuristic for early deployment of the splitting
feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
split a tenant, it will not need re-splitting soon after.
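Such a heuristic could look roughly like this; the struct, threshold, and target count are illustrative assumptions, not the controller's actual policy:

```
struct TenantSizeReport {
    shard_count: u8,
    physical_size: u64, // bytes, as returned by the top-N API
}

/// Returns the target shard count if this tenant should be split now.
fn should_split(report: &TenantSizeReport) -> Option<u8> {
    const SPLIT_THRESHOLD: u64 = 64 << 30; // 64 GiB
    (report.shard_count == 1 && report.physical_size > SPLIT_THRESHOLD).then_some(8)
}
```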
## Optimizations
### Flush parent shard to remote storage during split
Any data that is in WAL but not remote storage at time of split will need
to be replayed by child shards when they start for the first time. To minimize
this work, we may flush the parent shard to remote storage before writing the
remote indices for child shards.
It is important that this flush is subject to some time bounds: we may be splitting
in response to a surge of write ingest, so it may be time-critical to split. A
few seconds to flush latest data should be sufficient to optimize common cases without
running the risk of holding up a split for a harmful length of time when a parent
shard is being written heavily. If the flush doesn't complete in time, we may proceed
to shut down the parent shard and carry on with the split.
### Hard linking parent layers into child shard directories
Before we start the Tenant objects for child shards, we may pre-populate their
local storage directories with hard links to the layer files already present
in the parent shard's local directory. When the child shard starts and downloads
its remote index, it will find all those layer files already present on local disk.
This avoids wasting download capacity and makes splitting faster, but more importantly
it avoids taking up a factor of N more disk space when splitting 1 shard into N.
This mechanism will work well in typical flows where shards are migrated away
promptly after a split, but for the general case including what happens when
layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
section below.
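A minimal sketch of that pre-population step, assuming flat per-shard layer directories (the paths and layout are illustrative):

```
use std::{fs, io, path::Path};

fn link_parent_layers(parent_dir: &Path, child_dir: &Path) -> io::Result<()> {
    fs::create_dir_all(child_dir)?;
    for entry in fs::read_dir(parent_dir)? {
        let entry = entry?;
        if entry.file_type()?.is_file() {
            // A hard link shares the same blocks as the original, so the
            // child's copy costs no extra disk space until it is replaced
            // by a fresh download after eviction.
            fs::hard_link(entry.path(), child_dir.join(entry.file_name()))?;
        }
    }
    Ok(())
}
```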
### Filtering during compaction
Compaction, especially image layer generation, should skip any keys that are
present in a shard's layer files, but do not match the shard's ShardIdentity's
is_key_local() check. This avoids carrying around data for longer than necessary
in post-split compactions.
This was already implemented in https://github.com/neondatabase/neon/pull/6246
### Proactive compaction
In remote storage, there is little reason to rewrite any data on a shard split:
all the children can reference parent layers via the very cheap write of the child
index_part.json.
In local storage, things are more nuanced. During the initial split there is no
capacity cost to duplicating parent layers, if we implement the hard linking
optimization described above. However, as soon as any layers are evicted from
local disk and re-downloaded, the downloaded layers will not be hard-links any more:
they'll have real capacity footprint. That isn't a problem if we migrate child shards
away from the parent node swiftly, but it risks a significant over-use of local disk
space if we do not.
For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
the shards elsewhere, then churned all the layers in all the shards via eviction,
then we would blow up the storage capacity used on the node by 8x. If we're splitting
a 100GB shard, that could take the pageserver to the point of exhausting disk space.
To avoid this scenario, we could implement a special compaction mode where we just
read historic layers, drop unwanted keys, and write back the layer file. This
is pretty expensive, but useful if we have split a large shard and are not going to
migrate the child shards away.
The heuristic conditions for triggering such a compaction are:
- A) eviction plus time: if a child shard
has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
- B) resident size plus time: we may inspect the resident layers and calculate how
many of them include the overhead of storing pre-split keys. If, after some time
threshold (different to the one in case A), we still have such layers occupying
local disk space, then we should proactively compact them.
### Cleaning up parent-shard layers
It is functionally harmless to leave parent shard layers in remote storage indefinitely.
They would be cleaned up in the event of the tenant's deletion.
As an optimization to avoid leaking remote storage capacity (which costs money), we may
lazily clean up parent shard layers once no child shards reference them.
This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. those with `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
may drop out now.
- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
- for all ancestral shards, list objects in the prefix and delete any layer which was not
referenced by a current shard.
If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
The cleanup may be done by the scrubber (external process), or we may choose to have
the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
reading the other shard's indices at runtime, and we do not require visibility of the
latest index writes.
Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
that we retain the option to roll back a split in case of bugs.
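Under the assumption that the shard prefixes and their indices have already been listed (the listing and deletion calls are elided, and all names here are hypothetical stand-ins), the selection step of the procedure above might look like:

```rust
use std::collections::HashSet;

// Hypothetical, simplified view of one shard prefix in remote storage.
struct ShardPrefix {
    shard_count: u8,
    layers: Vec<String>, // layer object keys under this prefix
    referenced_ancestral: Vec<String>, // ancestral layers named by this shard's latest index
}

/// Ancestral layer objects that are safe to delete: everything under an
/// ancestral prefix that no current shard's latest index still references.
fn deletable_parent_layers(shards: &[ShardPrefix]) -> Vec<&String> {
    let max_count = shards.iter().map(|s| s.shard_count).max().unwrap_or(0);
    let (current, ancestral): (Vec<_>, Vec<_>) =
        shards.iter().partition(|s| s.shard_count == max_count);
    if ancestral.is_empty() {
        return Vec::new(); // nothing to clean up, drop out now
    }
    // Union of ancestral layers still referenced by any current shard's index.
    let referenced: HashSet<&String> = current
        .iter()
        .flat_map(|s| s.referenced_ancestral.iter())
        .collect();
    ancestral
        .iter()
        .flat_map(|s| s.layers.iter())
        .filter(|layer| !referenced.contains(layer))
        .collect()
}
```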
### Splitting secondary locations
We may implement a pageserver API similar to the main splitting API, which does a simpler
operation for secondary locations: it would not write anything to S3; instead it would simply
create the child shard directories on local disk, hard link in the parent's layer files,
and set up the in-memory (TenantSlot) state for the children.
Similar to attached locations, a subset of secondary locations will probably need re-locating
after the split is complete, to avoid leaving multiple child shards on the same pageservers,
where they may use excessive space for the tenant.
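A minimal sketch of the local-disk half of that operation, assuming a flat directory of layer files (the real layout is nested, and the in-memory TenantSlot setup is elided):

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Create the child shard directory and hard link every layer file from the
/// parent into it. A hard link shares the inode, so the child initially costs
/// no extra disk capacity.
fn hardlink_parent_layers(parent_dir: &Path, child_dir: &Path) -> io::Result<()> {
    fs::create_dir_all(child_dir)?;
    for entry in fs::read_dir(parent_dir)? {
        let entry = entry?;
        if entry.file_type()?.is_file() {
            fs::hard_link(entry.path(), child_dir.join(entry.file_name()))?;
        }
    }
    Ok(())
}
```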
## FAQ/Alternatives
### What should the thresholds be set to?
Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
Max shard count:
- The safekeeper overhead to sharding is currently O(N) network bandwidth because
the unfiltered WAL is sent to all shards. To avoid this growing out of control,
a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
on the safekeeper.
- There is also little benefit to increasing the shard count beyond the number
of pageservers in a region.
### Is it worth just rewriting all the data during a split to simplify reasoning about space?

View File

@@ -40,7 +40,7 @@ macro_rules! register_hll {
}};
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}

View File

@@ -6,10 +6,7 @@ use std::str::FromStr;
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{
models::{ShardParameters, TenantConfig},
shard::{ShardStripeSize, TenantShardId},
};
use crate::{models::ShardParameters, shard::TenantShardId};
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
@@ -38,7 +35,7 @@ pub struct NodeRegisterRequest {
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailabilityWrapper>,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
@@ -60,31 +57,6 @@ pub struct TenantLocateResponse {
pub shard_params: ShardParameters,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse {
pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy,
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,
pub node_attached: Option<NodeId>,
pub node_secondary: Vec<NodeId>,
pub last_error: String,
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
pub is_reconciling: bool,
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
pub is_pending_compute_notification: bool,
/// A shard split is currently underway
pub is_splitting: bool,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -94,76 +66,22 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
pub fn worst() -> Self {
UtilizationScore(u64::MAX)
}
}
#[derive(Serialize, Clone, Copy)]
#[serde(into = "NodeAvailabilityWrapper")]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active(UtilizationScore),
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl PartialEq for NodeAvailability {
fn eq(&self, other: &Self) -> bool {
use NodeAvailability::*;
matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
}
}
impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone)]
pub enum NodeAvailabilityWrapper {
Active,
Offline,
}
impl From<NodeAvailabilityWrapper> for NodeAvailability {
fn from(val: NodeAvailabilityWrapper) -> Self {
match val {
// Assume the worst utilisation score to begin with. It will later be updated by
// the heartbeats.
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
}
}
}
impl From<NodeAvailability> for NodeAvailabilityWrapper {
fn from(val: NodeAvailability) -> Self {
match val {
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
}
}
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
// This is used when parsing node configuration requests from neon-local.
// Assume the worst possible utilisation score
// and let it get updated via the heartbeats.
"active" => Ok(Self::Active(UtilizationScore::worst())),
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
@@ -209,8 +127,11 @@ impl From<NodeSchedulingPolicy> for String {
/// to create secondary locations.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
pub enum PlacementPolicy {
/// Normal live state: one attached pageserver and zero or more secondaries.
Attached(usize),
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode locations. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
@@ -232,14 +153,14 @@ mod test {
/// Check stability of PlacementPolicy's serialization
#[test]
fn placement_policy_encoding() -> anyhow::Result<()> {
let v = PlacementPolicy::Attached(1);
let v = PlacementPolicy::Double(1);
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "{\"Attached\":1}");
assert_eq!(encoded, "{\"Double\":1}");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
let v = PlacementPolicy::Detached;
let v = PlacementPolicy::Single;
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "\"Detached\"");
assert_eq!(encoded, "\"Single\"");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
Ok(())
}

View File

@@ -4,7 +4,6 @@ pub mod utilization;
pub use utilization::PageserverUtilization;
use std::{
borrow::Cow,
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
@@ -578,7 +577,7 @@ pub struct TimelineInfo {
pub walreceiver_status: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
pub struct LayerMapInfo {
pub in_memory_layers: Vec<InMemoryLayerInfo>,
pub historic_layers: Vec<HistoricLayerInfo>,
@@ -596,7 +595,7 @@ pub enum LayerAccessKind {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAccessStatFullDetails {
pub when_millis_since_epoch: u64,
pub task_kind: Cow<'static, str>,
pub task_kind: &'static str,
pub access_kind: LayerAccessKind,
}
@@ -655,23 +654,23 @@ impl LayerResidenceEvent {
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
pub struct LayerAccessStats {
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
pub task_kind_access_flag: Vec<Cow<'static, str>>,
pub task_kind_access_flag: Vec<&'static str>,
pub first: Option<LayerAccessStatFullDetails>,
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")]
pub enum InMemoryLayerInfo {
Open { lsn_start: Lsn },
Frozen { lsn_start: Lsn, lsn_end: Lsn },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")]
pub enum HistoricLayerInfo {
Delta {
@@ -693,32 +692,6 @@ pub enum HistoricLayerInfo {
},
}
impl HistoricLayerInfo {
pub fn layer_file_name(&self) -> &str {
match self {
HistoricLayerInfo::Delta {
layer_file_name, ..
} => layer_file_name,
HistoricLayerInfo::Image {
layer_file_name, ..
} => layer_file_name,
}
}
pub fn is_remote(&self) -> bool {
match self {
HistoricLayerInfo::Delta { remote, .. } => *remote,
HistoricLayerInfo::Image { remote, .. } => *remote,
}
}
pub fn set_remote(&mut self, value: bool) {
let field = match self {
HistoricLayerInfo::Delta { remote, .. } => remote,
HistoricLayerInfo::Image { remote, .. } => remote,
};
*field = value;
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct DownloadRemoteLayersTaskSpawnRequest {
pub max_concurrent_downloads: NonZeroUsize,
@@ -751,52 +724,6 @@ pub struct WalRedoManagerStatus {
pub pid: Option<u32>,
}
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
/// what's happening.
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded.
#[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
/// The number of layers currently on-disk
pub layers_downloaded: usize,
/// The number of layers in the most recently seen heatmap
pub layers_total: usize,
/// The number of layer bytes currently on-disk
pub bytes_downloaded: u64,
/// The number of layer bytes in the most recently seen heatmap
pub bytes_total: u64,
}
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
ts: &Option<SystemTime>,
serializer: S,
) -> Result<S::Ok, S::Error> {
match ts {
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
None => serializer.serialize_none(),
}
}
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
match s {
None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
.map_err(serde::de::Error::custom)
.map(Some),
}
}
pub mod virtual_file {
#[derive(
Copy,

View File

@@ -7,7 +7,7 @@ use std::time::SystemTime;
///
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
/// not handle full u64 values properly.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
#[derive(serde::Serialize, Debug)]
pub struct PageserverUtilization {
/// Used disk space
#[serde(serialize_with = "ser_saturating_u63")]
@@ -21,10 +21,7 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
#[serde(serialize_with = "ser_rfc3339_millis")]
pub captured_at: SystemTime,
}
@@ -35,14 +32,6 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of newtype, use this because a newtype would get require handling deserializing values

View File

@@ -6,9 +6,7 @@
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
};
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
/// startup.
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
pub register: Option<NodeRegisterRequest>,
}
fn default_mode() -> LocationConfigMode {
LocationConfigMode::AttachedSingle
}
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
pub id: TenantShardId,
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
pub gen: Option<u32>,
/// Default value only for backward compat: this field should be set
#[serde(default = "default_mode")]
pub mode: LocationConfigMode,
pub gen: u32,
}
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>,

View File

@@ -1,6 +1,5 @@
use anyhow::*;
use clap::{value_parser, Arg, ArgMatches, Command};
use postgres::Client;
use std::{path::PathBuf, str::FromStr};
use wal_craft::*;
@@ -9,8 +8,8 @@ fn main() -> Result<()> {
.init();
let arg_matches = cli().get_matches();
let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
let intermediate_lsns = match arg_matches
let wal_craft = |arg_matches: &ArgMatches, client| {
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
.get_one::<String>("type")
.map(|s| s.as_str())
.context("'type' is required")?
@@ -26,7 +25,6 @@ fn main() -> Result<()> {
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
a => panic!("Unknown --type argument: {a}"),
};
let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
for lsn in intermediate_lsns {
println!("intermediate_lsn = {lsn}");
}

View File

@@ -5,6 +5,7 @@ use postgres::types::PgLsn;
use postgres::Client;
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
use std::cmp::Ordering;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
@@ -231,52 +232,59 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
pub trait Crafter {
const NAME: &'static str;
/// Generates WAL using the client `client`. Returns a vector of some valid
/// "interesting" intermediate LSNs which one may start reading from.
/// test_end_of_wal uses this to check various starting points.
///
/// Note that postgres is generally keen about writing some WAL. While we
/// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
/// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
/// stable WAL end would be flaky unless postgres is shut down. For this
/// reason returning potential end of WAL here is pointless. Most of the
/// time this doesn't happen though, so it is reasonable to create needed
/// WAL structure and immediately kill postgres like test_end_of_wal does.
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
/// Generates WAL using the client `client`. Returns a pair of:
/// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
/// May include or exclude Lsn(0) and the end-of-wal.
/// * The expected end-of-wal LSN.
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
}
/// Wraps some WAL craft function, providing current LSN to it before the
/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
/// result.
fn craft_internal<C: postgres::GenericClient>(
client: &mut C,
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
) -> anyhow::Result<Vec<PgLsn>> {
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
ensure_server_config(client)?;
let initial_lsn = client.pg_current_wal_insert_lsn()?;
info!("LSN initial = {}", initial_lsn);
let mut intermediate_lsns = f(client, initial_lsn)?;
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
let last_lsn = match last_lsn {
None => client.pg_current_wal_insert_lsn()?,
Some(last_lsn) => {
let insert_lsn = client.pg_current_wal_insert_lsn()?;
match last_lsn.cmp(&insert_lsn) {
Ordering::Less => bail!(
"Some records were inserted after the crafted WAL: {} vs {}",
last_lsn,
insert_lsn
),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
}
}
};
if !intermediate_lsns.starts_with(&[initial_lsn]) {
intermediate_lsns.insert(0, initial_lsn);
}
// Some records may be not flushed, e.g. non-transactional logical messages.
//
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
// because pg_current_wal_insert_lsn skips page headers.
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
Ok(intermediate_lsns)
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
Ordering::Equal => {}
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
}
Ok((intermediate_lsns, last_lsn))
}
pub struct Simple;
impl Crafter for Simple {
const NAME: &'static str = "simple";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_internal(client, |client, _| {
client.execute("CREATE table t(x int)", &[])?;
Ok(Vec::new())
Ok((Vec::new(), None))
})
}
}
@@ -284,36 +292,29 @@ impl Crafter for Simple {
pub struct LastWalRecordXlogSwitch;
impl Crafter for LastWalRecordXlogSwitch {
const NAME: &'static str = "last_wal_record_xlog_switch";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
// Do not use craft_internal because here we end up with flush_lsn exactly on
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
// Do not use generate_internal because here we end up with flush_lsn exactly on
// the segment boundary and insert_lsn after the initial page header, which is unusual.
ensure_server_config(client)?;
client.execute("CREATE table t(x int)", &[])?;
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
// pg_switch_wal returns end of last record of the switched segment,
// i.e. end of SWITCH itself.
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let before_xlog_switch_u64 = u64::from(before_xlog_switch);
let next_segment = PgLsn::from(
before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
+ WAL_SEGMENT_SIZE as u64,
);
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
xlog_switch_record_end <= next_segment,
"XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
xlog_switch_record_end,
after_xlog_switch <= next_segment,
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
after_xlog_switch,
next_segment
);
Ok(vec![before_xlog_switch, xlog_switch_record_end])
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}
}
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
/// Craft xlog SWITCH record ending at page boundary.
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
// Do not use generate_internal because here we end up with flush_lsn exactly on
// the segment boundary and insert_lsn after the initial page header, which is unusual.
ensure_server_config(client)?;
@@ -360,29 +361,28 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
xlog_switch_record_end < next_segment,
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
xlog_switch_record_end,
after_xlog_switch < next_segment,
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
after_xlog_switch,
next_segment
);
ensure!(
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
after_xlog_switch,
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
);
Ok(vec![before_xlog_switch, xlog_switch_record_end])
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}
}
/// Write ~16MB logical message; it should cross WAL segment.
fn craft_seg_size_logical_message(
fn craft_single_logical_message(
client: &mut impl postgres::GenericClient,
transactional: bool,
) -> anyhow::Result<Vec<PgLsn>> {
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_internal(client, |client, initial_lsn| {
ensure!(
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -405,24 +405,34 @@ fn craft_seg_size_logical_message(
"Logical message crossed two segments"
);
Ok(vec![message_lsn])
if transactional {
// Transactional logical messages are part of a transaction, so the one above is
// followed by a small COMMIT record.
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
ensure!(
message_lsn < after_message_lsn,
"No record found after the emitted message"
);
Ok((vec![message_lsn], Some(after_message_lsn)))
} else {
Ok((Vec::new(), Some(message_lsn)))
}
})
}
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
// Transactional message crossing WAL segment will be followed by small
// commit record.
craft_seg_size_logical_message(client, true)
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_single_logical_message(client, true)
}
}
pub struct LastWalRecordCrossingSegment;
impl Crafter for LastWalRecordCrossingSegment {
const NAME: &'static str = "last_wal_record_crossing_segment";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
craft_seg_size_logical_message(client, false)
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_single_logical_message(client, false)
}
}

View File

@@ -11,15 +11,13 @@ use utils::const_assert;
use utils::lsn::Lsn;
fn init_logging() {
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
"crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
)))
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
))
.is_test(true)
.try_init();
}
/// Test that find_end_of_wal returns the same results as pg_dump on various
/// WALs created by Crafter.
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
use crate::*;
@@ -40,13 +38,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
}
cfg.initdb().unwrap();
let srv = cfg.start_server().unwrap();
let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let (intermediate_lsns, expected_end_of_wal_partial) =
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
// Kill postgres. Note that it might have inserted to WAL something after
// 'craft' did its job.
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
@@ -58,7 +56,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
for start_lsn in intermediate_lsns
.iter()
.chain(std::iter::once(&expected_end_of_wal))
@@ -93,7 +91,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
}
}
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
fn check_pg_waldump_end_of_wal(
cfg: &crate::Conf,
last_segment: &str,
expected_end_of_wal: Lsn,
) {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
@@ -111,8 +113,11 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
}
};
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!("waldump erred on {}", waldump_wal_end);
waldump_wal_end
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_end_of_wal
);
assert_eq!(waldump_wal_end, expected_end_of_wal);
}
fn check_end_of_wal(
@@ -205,9 +210,9 @@ pub fn test_update_next_xid() {
#[test]
pub fn test_encode_logical_message() {
let expected = [
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
];
let actual = encode_logical_message("prefix", "message");
assert_eq!(expected, actual[..]);

View File

@@ -18,7 +18,6 @@ camino.workspace = true
humantime.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }

View File

@@ -157,8 +157,9 @@ impl AzureBlobStorage {
let mut bufs = Vec::new();
while let Some(part) = response.next().await {
let part = part?;
let etag_str: &str = part.blob.properties.etag.as_ref();
if etag.is_none() {
etag = Some(part.blob.properties.etag);
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
}
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
@@ -173,16 +174,6 @@ impl AzureBlobStorage {
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
}
if bufs.is_empty() {
return Err(DownloadError::Other(anyhow::anyhow!(
"Azure GET response contained no buffers"
)));
}
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
let etag = etag.unwrap();
let last_modified = last_modified.unwrap();
Ok(Download {
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
etag,

View File

@@ -42,9 +42,6 @@ pub use self::{
};
use s3_bucket::RequestKind;
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
pub use azure_core::Etag;
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
@@ -294,9 +291,9 @@ pub type DownloadStream =
pub struct Download {
pub download_stream: DownloadStream,
/// The last time the file was modified (`last-modified` HTTP header)
pub last_modified: SystemTime,
pub last_modified: Option<SystemTime>,
/// A way to identify this specific version of the resource (`etag` HTTP header)
pub etag: Etag,
pub etag: Option<String>,
/// Extra key-value data, associated with the current remote file.
pub metadata: Option<StorageMetadata>,
}

View File

@@ -10,7 +10,7 @@ use std::{
io::ErrorKind,
num::NonZeroU32,
pin::Pin,
time::{Duration, SystemTime, UNIX_EPOCH},
time::{Duration, SystemTime},
};
use anyhow::{bail, ensure, Context};
@@ -30,7 +30,6 @@ use crate::{
};
use super::{RemoteStorage, StorageMetadata};
use crate::Etag;
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
@@ -198,7 +197,6 @@ impl LocalFs {
fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&temp_file_path)
.await
.with_context(|| {
@@ -408,37 +406,35 @@ impl RemoteStorage for LocalFs {
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let source = ReaderStream::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?,
);
let file_metadata = file_metadata(&target_path).await?;
let source = ReaderStream::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
let metadata = self
.read_storage_metadata(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?,
);
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&target_path)
.await
.map_err(DownloadError::Other)?;
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
let etag = mock_etag(&file_metadata);
Ok(Download {
metadata,
last_modified: file_metadata
.modified()
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
etag,
download_stream: Box::pin(source),
})
Ok(Download {
metadata,
last_modified: None,
etag: None,
download_stream: Box::pin(source),
})
} else {
Err(DownloadError::NotFound)
}
}
async fn download_byte_range(
@@ -456,51 +452,50 @@ impl RemoteStorage for LocalFs {
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
}
}
let target_path = from.with_base(&self.storage_root);
let file_metadata = file_metadata(&target_path).await?;
let mut source = tokio::fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let mut source = tokio::fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?;
let len = source
.metadata()
.await
.context("query file length")
.map_err(DownloadError::Other)?
.len();
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
.context("Failed to seek to the range start in a local storage file")
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&target_path)
.await
.map_err(DownloadError::Other)?;
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
let source = ReaderStream::new(source);
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
Ok(Download {
metadata,
last_modified: None,
etag: None,
download_stream: Box::pin(source),
})
.map_err(DownloadError::Other)?;
let len = source
.metadata()
.await
.context("query file length")
.map_err(DownloadError::Other)?
.len();
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
.context("Failed to seek to the range start in a local storage file")
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&target_path)
.await
.map_err(DownloadError::Other)?;
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
let source = ReaderStream::new(source);
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
let etag = mock_etag(&file_metadata);
Ok(Download {
metadata,
last_modified: file_metadata
.modified()
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
etag,
download_stream: Box::pin(source),
})
} else {
Err(DownloadError::NotFound)
}
}
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
@@ -615,22 +610,13 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
Ok(())
}
async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
tokio::fs::metadata(&file_path).await.map_err(|e| {
if e.kind() == ErrorKind::NotFound {
DownloadError::NotFound
} else {
DownloadError::BadInput(e.into())
}
})
}
// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we
// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
// quickly, with less overhead than using a mock S3 server.
fn mock_etag(meta: &std::fs::Metadata) -> Etag {
let mtime = meta.modified().expect("Filesystem mtime missing");
format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
if file_path.exists() {
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
Ok(true)
} else {
Ok(false)
}
}
#[cfg(test)]

View File

@@ -35,8 +35,8 @@ use aws_sdk_s3::{
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::byte_stream::ByteStream;
use aws_smithy_types::{body::SdkBody, DateTime};
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
@@ -287,17 +287,8 @@ impl S3Bucket {
let remaining = self.timeout.saturating_sub(started_at.elapsed());
let metadata = object_output.metadata().cloned().map(StorageMetadata);
let etag = object_output
.e_tag
.ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
.into();
let last_modified = object_output
.last_modified
.ok_or(DownloadError::Other(anyhow::anyhow!(
"Missing LastModified header"
)))?
.try_into()
.map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
let etag = object_output.e_tag;
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
let body = object_output.body;
let body = ByteStreamAsStream::from(body);

View File

@@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
// A little check to ensure that our clock is not too far off from the S3 clock
{
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
let last_modified = dl.last_modified;
let last_modified = dl.last_modified.unwrap();
let half_wt = WAIT_TIME.mul_f32(0.5);
let t0_hwt = t0 + half_wt;
let t1_hwt = t1 - half_wt;

View File

@@ -247,7 +247,7 @@ fn scenario_4() {
//
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
//
// (If we used the method from the previous scenario, and
// (If we used the the method from the previous scenario, and
// kept only snapshot at the branch point, we'd need to keep
// all the WAL between 10000-18000 on the main branch, so
// the total size would be 5000 + 1000 + 8000 = 14000. The

View File

@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
[dependencies]
arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
async-trait.workspace = true
anyhow.workspace = true
bincode.workspace = true
@@ -37,7 +36,6 @@ serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
@@ -48,7 +46,6 @@ strum.workspace = true
strum_macros.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true

View File

@@ -47,10 +47,9 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
}
}
#[derive(serde::Serialize, serde::Deserialize)]
#[derive(serde::Serialize)]
struct SerdeRepr<T> {
buffer: Vec<T>,
buffer_size: usize,
drop_count: u64,
}
@@ -62,7 +61,6 @@ where
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
SerdeRepr {
buffer: buffer.iter().cloned().collect(),
buffer_size: L,
drop_count: *drop_count,
}
}
@@ -80,52 +78,19 @@ where
}
}
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
where
T: Clone + serde::Deserialize<'de>,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let SerdeRepr {
buffer: des_buffer,
drop_count,
buffer_size,
} = SerdeRepr::<T>::deserialize(deserializer)?;
if buffer_size != L {
use serde::de::Error;
return Err(D::Error::custom(format!(
"invalid buffer_size, expecting {L} got {buffer_size}"
)));
}
let mut buffer = HistoryBuffer::new();
buffer.extend(des_buffer);
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
}
}
#[cfg(test)]
mod test {
use super::HistoryBufferWithDropCounter;
#[test]
fn test_basics() {
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
b.write(1);
b.write(2);
b.write(3);
assert!(b.iter().any(|e| *e == 2));
assert!(b.iter().any(|e| *e == 3));
assert!(!b.iter().any(|e| *e == 1));
// round-trip serde
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
assert_eq!(
round_tripped.iter().cloned().collect::<Vec<_>>(),
b.iter().cloned().collect::<Vec<_>>()
);
}
#[test]

View File

@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
}
}
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
SERVE_METRICS_COUNT.inc();
let started_at = std::time::Instant::now();
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
.middleware(Middleware::post_with_info(
add_request_id_header_to_response,
))
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.err_handler(route_error_handler)
}

View File

@@ -87,8 +87,6 @@ pub mod failpoint_support;
pub mod yielding_loop;
pub mod zstd;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
let lock_file = fs::OpenOptions::new()
.create(true) // O_CREAT
.truncate(true)
.write(true)
.open(lock_file_path)
.context("open lock file")?;

View File

@@ -29,10 +29,12 @@ pub struct PageserverFeedback {
// Serialize with RFC3339 format.
#[serde(with = "serde_systemtime")]
pub replytime: SystemTime,
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
pub shard_number: u32,
}
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
// Do not remove previously available fields because this might be backwards incompatible.
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
impl PageserverFeedback {
pub fn empty() -> PageserverFeedback {
PageserverFeedback {
@@ -41,7 +43,6 @@ impl PageserverFeedback {
remote_consistent_lsn: Lsn::INVALID,
disk_consistent_lsn: Lsn::INVALID,
replytime: *PG_EPOCH,
shard_number: 0,
}
}
@@ -58,26 +59,17 @@ impl PageserverFeedback {
//
// TODO: change serialized fields names once all computes migrate to rename.
pub fn serialize(&self, buf: &mut BytesMut) {
let buf_ptr = buf.len();
buf.put_u8(0); // # of keys, will be filled later
let mut nkeys = 0;
nkeys += 1;
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
buf.put_slice(b"current_timeline_size\0");
buf.put_i32(8);
buf.put_u64(self.current_timeline_size);
nkeys += 1;
buf.put_slice(b"ps_writelsn\0");
buf.put_i32(8);
buf.put_u64(self.last_received_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_flushlsn\0");
buf.put_i32(8);
buf.put_u64(self.disk_consistent_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_applylsn\0");
buf.put_i32(8);
buf.put_u64(self.remote_consistent_lsn.0);
@@ -88,19 +80,9 @@ impl PageserverFeedback {
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
.as_micros() as i64;
nkeys += 1;
buf.put_slice(b"ps_replytime\0");
buf.put_i32(8);
buf.put_i64(timestamp);
if self.shard_number > 0 {
nkeys += 1;
buf.put_slice(b"shard_number\0");
buf.put_i32(4);
buf.put_u32(self.shard_number);
}
buf[buf_ptr] = nkeys;
}
// Deserialize PageserverFeedback message
@@ -143,8 +125,9 @@ impl PageserverFeedback {
}
b"shard_number" => {
let len = buf.get_i32();
assert_eq!(len, 4);
rf.shard_number = buf.get_u32();
// TODO: this will be implemented in the next update,
// for now, we just skip the value.
buf.advance(len as usize);
}
_ => {
let len = buf.get_i32();
@@ -217,7 +200,10 @@ mod tests {
rf.serialize(&mut data);
// Add an extra field to the buffer and adjust number of keys
data[0] += 1;
if let Some(first) = data.first_mut() {
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
}
data.put_slice(b"new_field_one\0");
data.put_i32(8);
data.put_u64(42);

View File

@@ -110,49 +110,6 @@ impl<T> OnceCell<T> {
}
}
/// Returns a guard to an existing initialized value, or returns an unique initialization
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
// It looks like OnceCell::get_or_init could be implemented using this method instead of
// duplication. However, that makes the future be !Send due to possibly holding on to the
// MutexGuard over an await point.
loop {
let sem = {
let guard = self.inner.lock().unwrap();
if guard.value.is_some() {
return Ok(Guard(guard));
}
guard.init_semaphore.clone()
};
{
let permit = {
// increment the count for the duration of queued
let _guard = CountWaitingInitializers::start(self);
sem.acquire().await
};
let Ok(permit) = permit else {
let guard = self.inner.lock().unwrap();
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
// there was a take_and_deinit in between
continue;
}
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
};
permit.forget();
}
let permit = InitPermit(sem);
return Err(permit);
}
}
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
/// to complete initializing the inner value.
///
@@ -245,7 +202,7 @@ impl<'a, T> Guard<'a, T> {
///
/// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
let mut swapped = Inner::default();
let sem = swapped.init_semaphore.clone();
// acquire and forget right away, moving the control over to InitPermit
@@ -524,39 +481,4 @@ mod tests {
assert_eq!("t1", *cell.get().unwrap());
}
#[tokio::test(start_paused = true)]
async fn detached_init_smoke() {
let target = OnceCell::default();
let Err(permit) = target.get_or_init_detached().await else {
unreachable!("it is not initialized")
};
tokio::time::timeout(
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
)
.await
.expect_err("should timeout since we are already holding the permit");
target.set(42, permit);
let (_answer, permit) = {
let guard = target
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
.await
.unwrap();
assert_eq!(*guard, 42);
guard.take_and_deinit()
};
assert!(target.get().is_none());
target.set(11, permit);
assert_eq!(*target.get().unwrap(), 11);
}
}

View File

@@ -1,60 +1,27 @@
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VecMapOrdering {
Greater,
GreaterOrEqual,
}
/// Ordered map datastructure implemented in a Vec.
/// Append only - can only add keys that are larger than the
/// current max key.
/// Ordering can be adjusted using [`VecMapOrdering`]
/// during `VecMap` construction.
#[derive(Clone, Debug)]
pub struct VecMap<K, V> {
data: Vec<(K, V)>,
ordering: VecMapOrdering,
}
pub struct VecMap<K, V>(Vec<(K, V)>);
impl<K, V> Default for VecMap<K, V> {
fn default() -> Self {
VecMap {
data: Default::default(),
ordering: VecMapOrdering::Greater,
}
VecMap(Default::default())
}
}
#[derive(thiserror::Error, Debug)]
pub enum VecMapError {
#[error("Key violates ordering constraint")]
InvalidKey,
#[error("Mismatched ordering constraints")]
ExtendOrderingError,
}
#[derive(Debug)]
pub struct InvalidKey;
impl<K: Ord, V> VecMap<K, V> {
pub fn new(ordering: VecMapOrdering) -> Self {
Self {
data: Vec::new(),
ordering,
}
}
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
Self {
data: Vec::with_capacity(capacity),
ordering,
}
}
pub fn is_empty(&self) -> bool {
self.data.is_empty()
self.0.is_empty()
}
pub fn as_slice(&self) -> &[(K, V)] {
self.data.as_slice()
self.0.as_slice()
}
/// This function may panic if given a range where the lower bound is
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
use std::ops::Bound::*;
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
let start_idx = match range.start_bound() {
Unbounded => 0,
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
};
let end_idx = match range.end_bound() {
Unbounded => self.data.len(),
Unbounded => self.0.len(),
Included(k) => match binary_search(k) {
Ok(idx) => idx + 1,
Err(idx) => idx,
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
};
&self.data[start_idx..end_idx]
&self.0[start_idx..end_idx]
}
/// Add a key value pair to the map.
/// If `key` is not respective of the `self` ordering the
/// pair will not be added and `InvalidKey` error will be returned.
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
self.validate_key_order(&key)?;
/// If `key` is less than or equal to the current maximum key
/// the pair will not be added and InvalidKey error will be returned.
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
if let Some((last_key, _last_value)) = self.0.last() {
if &key <= last_key {
return Err(InvalidKey);
}
}
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
Ok(delta_size)
}
/// Update the maximum key value pair or add a new key value pair to the map.
/// If `key` is not respective of the `self` ordering no updates or additions
/// will occur and `InvalidKey` error will be returned.
/// If `key` is less than the current maximum key no updates or additions
/// will occur and InvalidKey error will be returned.
pub fn append_or_update_last(
&mut self,
key: K,
mut value: V,
) -> Result<(Option<V>, usize), VecMapError> {
if let Some((last_key, last_value)) = self.data.last_mut() {
) -> Result<(Option<V>, usize), InvalidKey> {
if let Some((last_key, last_value)) = self.0.last_mut() {
match key.cmp(last_key) {
Ordering::Less => return Err(VecMapError::InvalidKey),
Ordering::Less => return Err(InvalidKey),
Ordering::Equal => {
std::mem::swap(last_value, &mut value);
const DELTA_SIZE: usize = 0;
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
V: Clone,
{
let split_idx = self
.data
.0
.binary_search_by_key(&cutoff, extract_key)
.unwrap_or_else(std::convert::identity);
(
VecMap {
data: self.data[..split_idx].to_vec(),
ordering: self.ordering,
},
VecMap {
data: self.data[split_idx..].to_vec(),
ordering: self.ordering,
},
VecMap(self.0[..split_idx].to_vec()),
VecMap(self.0[split_idx..].to_vec()),
)
}
/// Move items from `other` to the end of `self`, leaving `other` empty.
/// If the `other` ordering is different from `self` ordering
/// `ExtendOrderingError` error will be returned.
/// If any keys in `other` is not respective of the ordering defined in
/// `self`, `InvalidKey` error will be returned and no mutation will occur.
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
if self.ordering != other.ordering {
return Err(VecMapError::ExtendOrderingError);
}
/// If any keys in `other` is less than or equal to any key in `self`,
/// `InvalidKey` error will be returned and no mutation will occur.
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
let self_last_opt = self.0.last().map(extract_key);
let other_first_opt = other.0.last().map(extract_key);
let other_first_opt = other.data.last().map(extract_key);
if let Some(other_first) = other_first_opt {
self.validate_key_order(other_first)?;
}
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
Ok(delta_size)
}
/// Validate the current last key in `self` and key being
/// inserted against the order defined in `self`.
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
if let Some(last_key) = self.data.last().map(extract_key) {
match (&self.ordering, &key.cmp(last_key)) {
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
return Err(VecMapError::InvalidKey);
}
(VecMapOrdering::Greater, Ordering::Greater) => {}
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
return Err(VecMapError::InvalidKey);
}
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
if self_last >= other_first {
return Err(InvalidKey);
}
}
Ok(())
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
Ok(delta_size)
}
/// Instrument an operation on the underlying [`Vec`].
/// Will panic if the operation decreases capacity.
/// Returns the increase in memory usage caused by the op.
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
let old_cap = self.data.capacity();
op(&mut self.data);
let new_cap = self.data.capacity();
let old_cap = self.0.capacity();
op(&mut self.0);
let new_cap = self.0.capacity();
match old_cap.cmp(&new_cap) {
Ordering::Less => {
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
}
}
/// Similar to `from_iter` defined in `FromIter` trait except
/// that it accepts an [`VecMapOrdering`]
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
let iter = iter.into_iter();
let initial_capacity = {
match iter.size_hint() {
(lower_bound, None) => lower_bound,
(_, Some(upper_bound)) => upper_bound,
}
};
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
for (key, value) in iter {
vec_map
.append(key, value)
.expect("The passed collection needs to be sorted!");
}
vec_map
}
}
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
type Item = (K, V);
type IntoIter = std::vec::IntoIter<(K, V)>;
fn into_iter(self) -> Self::IntoIter {
self.data.into_iter()
}
}
fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
mod tests {
use std::{collections::BTreeMap, ops::Bound};
use super::{VecMap, VecMapOrdering};
use super::VecMap;
#[test]
fn unbounded_range() {
@@ -396,59 +310,5 @@ mod tests {
left.extend(&mut one_map).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
assert_eq!(one_map.as_slice(), &[(1, ())]);
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
map_greater_or_equal.append(2, ()).unwrap();
map_greater_or_equal.append(2, ()).unwrap();
left.extend(&mut map_greater_or_equal).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
}
#[test]
fn extend_with_ordering() {
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
left.append(0, ()).unwrap();
assert_eq!(left.as_slice(), &[(0, ())]);
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
greater_right.append(0, ()).unwrap();
left.extend(&mut greater_right).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ())]);
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
greater_or_equal_right.append(2, ()).unwrap();
greater_or_equal_right.append(2, ()).unwrap();
left.extend(&mut greater_or_equal_right).unwrap();
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
}
#[test]
fn vec_map_from_sorted() {
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
assert_eq!(
vec_map.as_slice(),
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
);
}
#[test]
#[should_panic]
fn vec_map_from_unsorted_greater() {
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
}
#[test]
#[should_panic]
fn vec_map_from_unsorted_greater_or_equal() {
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
}
}

View File

@@ -1,78 +0,0 @@
use std::io::SeekFrom;
use anyhow::{Context, Result};
use async_compression::{
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
zstd::CParameter,
Level,
};
use camino::Utf8Path;
use nix::NixPath;
use tokio::{
fs::{File, OpenOptions},
io::AsyncBufRead,
io::AsyncSeekExt,
io::AsyncWriteExt,
};
use tokio_tar::{Archive, Builder, HeaderMode};
use walkdir::WalkDir;
/// Creates a Zstandard tarball.
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tarball)
.await
.with_context(|| format!("tempfile creation {tarball}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for p in paths {
let rel_path = p.strip_prefix(path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&p, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
/// Creates a Zstandard tarball.
pub async fn extract_zst_tarball(
path: &Utf8Path,
tarball: impl AsyncBufRead + Unpin,
) -> Result<()> {
let decoder = Box::pin(ZstdDecoder::new(tarball));
let mut archive = Archive::new(decoder);
archive.unpack(path).await?;
Ok(())
}

View File

@@ -69,7 +69,7 @@ pub struct Config {
/// should be removed once we have a better solution there.
sys_buffer_bytes: u64,
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
/// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
/// threshold.

View File

@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
}
}
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
(*api).process_safekeeper_feedback(&mut (*wp))
}
}

View File

@@ -142,7 +142,7 @@ pub trait ApiImpl {
todo!()
}
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
todo!()
}

View File

@@ -59,7 +59,6 @@ signal-hook.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
@@ -90,9 +89,6 @@ enumset = { workspace = true, features = ["serde"]}
strum.workspace = true
strum_macros.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true

View File

@@ -1,156 +1,160 @@
//! Quantify a single walredo manager's throughput under N concurrent callers.
//! Simple benchmarking around walredo.
//!
//! The benchmark implementation ([`bench_impl`]) is parametrized by
//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`]
//! - `n_redos` => number of times the benchmark shall execute the `redo_work`
//! - `nclients` => number of clients (more on this shortly).
//! Right now these benchmarks just aim to set a baseline. Later we can try to expand into latency
//! and throughput measurements after figuring out the coordinated omission problems below.
//!
//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
//! It spawns `nclients` times [`client`] tokio tasks.
//! Each task executes the `redo_work` `n_redos/nclients` times.
//! There are two sets of inputs: `short` and `medium`. They were collected on postgres v14 by
//! logging what happens when a sequential scan is requested on a small table, then picking out two
//! suitable requests from the logs.
//!
//! We exercise the following combinations:
//! - `redo_work = short / medium`
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
//!
//! We let `criterion` determine the `n_redos` using `iter_custom`.
//! The idea is that for each `(redo_work, nclients)` combination,
//! criterion will run the `bench_impl` multiple times with different `n_redos`.
//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
//! Criterion will divide that by `n_redos` to compute the "time per iteration".
//! In our case, "time per iteration" means "time per redo_work execution".
//!
//! NB: the way in which `iter_custom` determines the "number of iterations"
//! is called sampling. Apparently the idea here is to detect outliers.
//! We're not sure whether the current choice of sampling method makes sense.
//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
//!
//! # Reference Numbers
//!
//! 2024-03-20 on i3en.3xlarge
//!
//! ```text
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
//! ```
//! Reference data (git blame to see commit) on an i3en.3xlarge
//! ```text
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
//! ```
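// A minimal, self-contained sketch (not part of the original file) of the
// `iter_custom` pattern described above: criterion hands us an iteration count,
// we run the work ourselves and return the total wall-clock time, and criterion
// divides by the count to report "time per iteration".
fn iter_custom_sketch(c: &mut criterion::Criterion) {
    c.bench_function("sketch", |b| {
        b.iter_custom(|iters| {
            let start = std::time::Instant::now();
            for _ in 0..iters {
                std::hint::black_box(42u64.wrapping_mul(7)); // stand-in for redo_work
            }
            start.elapsed()
        });
    });
}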
use std::sync::Arc;
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
sync::Arc,
time::{Duration, Instant},
use pageserver::{
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
};
use tokio::{sync::Barrier, task::JoinSet};
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
}
criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
fn redo_scenarios(c: &mut Criterion) {
// logging should be enabled when adding more inputs, since walredo only reports malformed
// input to stderr.
// utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
/// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager);
{
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
tracing::info!("executing first");
rt.block_on(short().execute(&manager)).unwrap();
tracing::info!("first executed");
}
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
let mut group = c.benchmark_group("short");
group.sampling_mode(criterion::SamplingMode::Flat);
for thread_count in thread_counts {
group.bench_with_input(
BenchmarkId::new("short", thread_count),
&thread_count,
|b, thread_count| {
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
},
);
}
drop(group);
let mut group = c.benchmark_group("medium");
group.sampling_mode(criterion::SamplingMode::Flat);
for thread_count in thread_counts {
group.bench_with_input(
BenchmarkId::new("medium", thread_count),
&thread_count,
|b, thread_count| {
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
},
);
}
drop(group);
}
/// Sets up a multi-threaded tokio runtime with the default worker thread count,
/// then spawns `nrequesters` tasks that repeatedly:
/// - get input from `input_factory()`
/// - call `manager.request_redo()` with their input
///
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
///
/// Using tokio's default worker thread count means the results will differ on machines
/// with different core counts. We don't care about that; the performance will always
/// be different on different hardware. To compare performance of different software versions,
/// use the same hardware.
fn add_multithreaded_walredo_requesters(
b: &mut criterion::Bencher,
nrequesters: usize,
manager: &Arc<PostgresRedoManager>,
input_factory: fn() -> Request,
) {
assert_ne!(nrequesters, 0);
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
let start = Arc::new(Barrier::new(nclients as usize));
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
let mut tasks = JoinSet::new();
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager);
for _ in 0..nclients {
rt.block_on(async {
tasks.spawn(client(
Arc::clone(&manager),
Arc::clone(&start),
Arc::clone(&redo_work),
// divide the amount of work equally among the clients
n_redos / nclients,
))
let mut requesters = JoinSet::new();
for _ in 0..nrequesters {
let _entered = rt.enter();
let manager = manager.clone();
let barrier = barrier.clone();
requesters.spawn(async move {
loop {
let input = input_factory();
barrier.wait().await;
let page = input.execute(&manager).await.unwrap();
assert_eq!(page.remaining(), 8192);
barrier.wait().await;
}
});
}
rt.block_on(async move {
let mut total_wallclock_time = std::time::Duration::from_millis(0);
while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap();
}
total_wallclock_time
})
let do_one_iteration = || {
rt.block_on(async {
barrier.wait().await;
// wait for work to complete
barrier.wait().await;
})
};
b.iter_batched(
|| {
// warmup
do_one_iteration();
},
|()| {
// work loop
do_one_iteration();
},
criterion::BatchSize::PerIteration,
);
rt.block_on(requesters.shutdown());
}
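// A minimal, self-contained sketch (illustrative only; the real benchmark wires
// this into criterion) of the two-barrier handshake used above: `nworkers` tasks
// park on the barrier, do one unit of work, then park again, so the driver can
// time exactly one round by waiting on the barrier twice.
fn barrier_sketch(nworkers: usize) {
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();
    let barrier = std::sync::Arc::new(tokio::sync::Barrier::new(nworkers + 1));
    for _ in 0..nworkers {
        let barrier = barrier.clone();
        let _entered = rt.enter();
        tokio::spawn(async move {
            loop {
                barrier.wait().await; // round starts
                tokio::task::yield_now().await; // stand-in for the redo work
                barrier.wait().await; // round ends
            }
        });
    }
    // Time one round from the driver's perspective.
    let elapsed = rt.block_on(async {
        let start = std::time::Instant::now();
        barrier.wait().await;
        barrier.wait().await;
        start.elapsed()
    });
    println!("one round took {elapsed:?}");
}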
async fn client(
mgr: Arc<PostgresRedoManager>,
start: Arc<Barrier>,
redo_work: Arc<Request>,
n_redos: u64,
) -> Duration {
start.wait().await;
let start = Instant::now();
for _ in 0..n_redos {
let page = redo_work.execute(&mgr).await.unwrap();
assert_eq!(page.remaining(), 8192);
// The real pageserver will rarely if ever do 2 walredos in a row without
// yielding to the executor.
tokio::task::yield_now().await;
}
start.elapsed()
}
criterion_group!(benches, redo_scenarios);
criterion_main!(benches);
macro_rules! lsn {
($input:expr) => {{
@@ -162,46 +166,12 @@ macro_rules! lsn {
}};
}
/// Simple wrapper around `WalRedoManager::request_redo`.
///
/// In benchmarks this is cloned around.
#[derive(Clone)]
struct Request {
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
}
impl Request {
async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
let Request {
key,
lsn,
base_img,
records,
pg_version,
} = self;
// TODO: avoid these clones
manager
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
.await
}
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
let rec = Bytes::from_static(bytes);
NeonWalRecord::Postgres { will_init, rec }
}
/// Short payload, 1132 bytes.
// pg_records are copy-pasted from the log, where they are printed with the Debug impl of Bytes,
// which uses \0 for null bytes.
#[allow(clippy::octal_escapes)]
pub fn short_input() -> Request {
let pg_record = Self::pg_record;
Request {
/// Short payload, 1132 bytes.
// pg_records are copy-pasted from the log, where they are printed with the Debug impl of Bytes,
// which uses \0 for null bytes.
#[allow(clippy::octal_escapes)]
fn short() -> Request {
Request {
key: Key {
field1: 0,
field2: 1663,
@@ -224,14 +194,13 @@ impl Request {
],
pg_version: 14,
}
}
}
/// Medium sized payload, serializes as 26393 bytes.
// see [`short`]
#[allow(clippy::octal_escapes)]
pub fn medium_input() -> Request {
let pg_record = Self::pg_record;
Request {
/// Medium sized payload, serializes as 26393 bytes.
// see [`short`]
#[allow(clippy::octal_escapes)]
fn medium() -> Request {
Request {
key: Key {
field1: 0,
field2: 1663,
@@ -473,5 +442,37 @@ impl Request {
],
pg_version: 14,
}
}
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
let rec = Bytes::from_static(bytes);
NeonWalRecord::Postgres { will_init, rec }
}
/// Simple wrapper around `WalRedoManager::request_redo`.
///
/// In benchmarks this is cloned around.
#[derive(Clone)]
struct Request {
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
}
impl Request {
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
let Request {
key,
lsn,
base_img,
records,
pg_version,
} = self;
manager
.request_redo(key, lsn, base_img, records, pg_version)
.await
}
}

View File

@@ -169,7 +169,7 @@ impl Client {
self.request(Method::GET, uri, ()).await
}
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
@@ -181,16 +181,7 @@ impl Client {
} else {
req
};
req.json(&body).send().await.map_err(Error::ReceiveBody)
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
let res = self.request_noerror(method, uri, body).await?;
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
let response = res.error_from_body().await?;
Ok(response)
}
@@ -249,26 +240,13 @@ impl Client {
Ok(())
}
pub async fn tenant_secondary_download(
&self,
tenant_id: TenantShardId,
wait: Option<std::time::Duration>,
) -> Result<(StatusCode, SecondaryProgress)> {
let mut path = reqwest::Url::parse(&format!(
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/secondary/download",
self.mgmt_api_endpoint, tenant_id
))
.expect("Cannot build URL");
if let Some(wait) = wait {
path.query_pairs_mut()
.append_pair("wait_ms", &format!("{}", wait.as_millis()));
}
let response = self.request(Method::POST, path, ()).await?;
let status = response.status();
let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
Ok((status, progress))
);
self.request(Method::POST, &uri, ()).await?;
Ok(())
}
pub async fn location_config(
@@ -438,77 +416,4 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
self.get(uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn layer_map_info(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<LayerMapInfo> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/layer",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn layer_evict(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
layer_file_name: &str,
) -> Result<bool> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/layer/{}",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
);
let resp = self.request_noerror(Method::DELETE, &uri, ()).await?;
match resp.status() {
StatusCode::OK => Ok(true),
StatusCode::NOT_MODIFIED => Ok(false),
// TODO: dedupe this pattern / introduce separate error variant?
status => Err(match resp.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
}
}),
}
}
pub async fn layer_ondemand_download(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
layer_file_name: &str,
) -> Result<bool> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/layer/{}",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
);
let resp = self.request_noerror(Method::GET, &uri, ()).await?;
match resp.status() {
StatusCode::OK => Ok(true),
StatusCode::NOT_MODIFIED => Ok(false),
// TODO: dedupe this pattern / introduce separate error variant?
status => Err(match resp.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
}
}),
}
}
}

View File

@@ -1,272 +0,0 @@
use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
use pageserver_client::mgmt_api;
use rand::seq::SliceRandom;
use tracing::{debug, info};
use utils::id::{TenantTimelineId, TimelineId};
use tokio::{
sync::{mpsc, OwnedSemaphorePermit},
task::JoinSet,
};
use std::{
num::NonZeroUsize,
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
time::{Duration, Instant},
};
/// Evict & on-demand download random layers.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long)]
runtime: Option<humantime::Duration>,
#[clap(long, default_value = "1")]
tasks_per_target: NonZeroUsize,
#[clap(long, default_value = "1")]
concurrency_per_target: NonZeroUsize,
/// Probability for sending `latest=true` in the request (uniform distribution).
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
/// Before starting the benchmark, live-reconfigure the pageserver to use the given
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
#[clap(long)]
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
targets: Option<Vec<TenantTimelineId>>,
}
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()?;
let task = rt.spawn(main_impl(args));
rt.block_on(task).unwrap().unwrap();
Ok(())
}
#[derive(Debug, Default)]
struct LiveStats {
evictions: AtomicU64,
downloads: AtomicU64,
timeline_restarts: AtomicU64,
}
impl LiveStats {
fn eviction_done(&self) {
self.evictions.fetch_add(1, Ordering::Relaxed);
}
fn download_done(&self) {
self.downloads.fetch_add(1, Ordering::Relaxed);
}
fn timeline_restart_done(&self) {
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
}
}
async fn main_impl(args: Args) -> anyhow::Result<()> {
let args: &'static Args = Box::leak(Box::new(args));
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
));
if let Some(engine_str) = &args.set_io_engine {
mgmt_api_client.put_io_engine(engine_str).await?;
}
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: args.limit_to_first_n_targets,
targets: args.targets.clone(),
},
)
.await?;
let mut tasks = JoinSet::new();
let live_stats = Arc::new(LiveStats::default());
tasks.spawn({
let live_stats = Arc::clone(&live_stats);
async move {
let mut last_at = Instant::now();
loop {
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
let now = Instant::now();
let delta: Duration = now - last_at;
last_at = now;
let LiveStats {
evictions,
downloads,
timeline_restarts,
} = &*live_stats;
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
}
}
});
for tl in timelines {
for _ in 0..args.tasks_per_target.get() {
tasks.spawn(timeline_actor(
args,
Arc::clone(&mgmt_api_client),
tl,
Arc::clone(&live_stats),
));
}
}
while let Some(res) = tasks.join_next().await {
res.unwrap();
}
Ok(())
}
async fn timeline_actor(
args: &'static Args,
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
timeline: TenantTimelineId,
live_stats: Arc<LiveStats>,
) {
// TODO: support sharding
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
struct Timeline {
joinset: JoinSet<()>,
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
concurrency: Arc<tokio::sync::Semaphore>,
}
loop {
debug!("restarting timeline");
let layer_map_info = mgmt_api_client
.layer_map_info(tenant_shard_id, timeline.timeline_id)
.await
.unwrap();
let concurrency = Arc::new(tokio::sync::Semaphore::new(
args.concurrency_per_target.get(),
));
let mut joinset = JoinSet::new();
let layers = layer_map_info
.historic_layers
.into_iter()
.map(|historic_layer| {
let (tx, rx) = mpsc::channel(1);
joinset.spawn(layer_actor(
tenant_shard_id,
timeline.timeline_id,
historic_layer,
rx,
Arc::clone(&mgmt_api_client),
Arc::clone(&live_stats),
));
tx
})
.collect::<Vec<_>>();
let mut timeline = Timeline {
joinset,
layers,
concurrency,
};
live_stats.timeline_restart_done();
loop {
assert!(!timeline.joinset.is_empty());
if let Some(res) = timeline.joinset.try_join_next() {
debug!(?res, "a layer actor exited, should not happen");
timeline.joinset.shutdown().await;
break;
}
let mut permit = Some(
Arc::clone(&timeline.concurrency)
.acquire_owned()
.await
.unwrap(),
);
loop {
let layer_tx = {
let mut rng = rand::thread_rng();
timeline.layers.choose_mut(&mut rng).expect("no layers")
};
match layer_tx.try_send(permit.take().unwrap()) {
Ok(_) => break,
Err(e) => match e {
mpsc::error::TrySendError::Full(back) => {
// TODO: retrying introduces bias away from slow downloaders
permit.replace(back);
}
mpsc::error::TrySendError::Closed(_) => panic!(),
},
}
}
}
}
}
async fn layer_actor(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
mut layer: HistoricLayerInfo,
mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
mgmt_api_client: Arc<mgmt_api::Client>,
live_stats: Arc<LiveStats>,
) {
#[derive(Clone, Copy)]
enum Action {
Evict,
OnDemandDownload,
}
while let Some(_permit) = rx.recv().await {
let action = if layer.is_remote() {
Action::OnDemandDownload
} else {
Action::Evict
};
let did_it = match action {
Action::Evict => {
let did_it = mgmt_api_client
.layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
.await
.unwrap();
live_stats.eviction_done();
did_it
}
Action::OnDemandDownload => {
let did_it = mgmt_api_client
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
.await
.unwrap();
live_stats.download_done();
did_it
}
};
if !did_it {
debug!("local copy of layer map appears out of sync, re-downloading");
return;
}
debug!("did it");
layer.set_remote(match action {
Action::Evict => true,
Action::OnDemandDownload => false,
});
}
}

View File

@@ -16,7 +16,6 @@ mod util {
mod cmd {
pub(super) mod basebackup;
pub(super) mod getpage_latest_lsn;
pub(super) mod ondemand_download_churn;
pub(super) mod trigger_initial_size_calculation;
}
@@ -26,7 +25,6 @@ enum Args {
Basebackup(cmd::basebackup::Args),
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
}
fn main() {
@@ -45,7 +43,6 @@ fn main() {
Args::TriggerInitialSizeCalculation(args) => {
cmd::trigger_initial_size_calculation::main(args)
}
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
}
.unwrap()
}

View File

@@ -120,9 +120,6 @@ fn main() -> anyhow::Result<()> {
&[("node_id", &conf.id.to_string())],
);
// after setting up logging, log the effective IO engine choice
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path())
@@ -317,7 +314,6 @@ fn start_pageserver(
let http_listener = tcp_listener::bind(http_addr)?;
let pg_addr = &conf.listen_pg_addr;
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
@@ -550,7 +546,7 @@ fn start_pageserver(
let router_state = Arc::new(
http::routes::State::new(
conf,
tenant_manager.clone(),
tenant_manager,
http_auth.clone(),
remote_storage.clone(),
broker_client.clone(),
@@ -600,37 +596,32 @@ fn start_pageserver(
None,
"consumption metrics collection",
true,
{
let tenant_manager = tenant_manager.clone();
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
pageserver::consumption_metrics::collect_metrics(
tenant_manager,
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
}
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
},
);
}
@@ -699,7 +690,6 @@ fn start_pageserver(
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));

View File

@@ -30,17 +30,18 @@ use utils::{
logging::LogFormat,
};
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::virtual_file;
use crate::{
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -95,8 +96,6 @@ pub mod defaults {
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
///
/// Default built-in configuration file.
///
@@ -158,8 +157,6 @@ pub mod defaults {
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
[remote_storage]
"#
@@ -238,7 +235,6 @@ pub struct PageServerConf {
// How often to send unchanged cached metrics to the metrics endpoint.
pub cached_metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub metric_collection_bucket: Option<RemoteStorageConfig>,
pub synthetic_size_calculation_interval: Duration,
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
@@ -283,13 +279,6 @@ pub struct PageServerConf {
pub max_vectored_read_bytes: MaxVectoredReadBytes,
pub validate_vectored_get: bool,
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
/// of ephemeral data.
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,
}
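// Worked example (hypothetical numbers, not from the config above): with
// ephemeral_bytes_per_memory_kb = 100 on a host with 16 GiB of RAM
// (16 * 1024 * 1024 KiB), ephemeral layer content would be capped at
// 100 * 16 * 1024 * 1024 bytes ≈ 1.6 GiB before proactive closing kicks in.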
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -302,23 +291,16 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
// use dedicated enum for builder to better indicate the intention
// and avoid possible confusion with nested options
#[derive(Clone, Default)]
pub enum BuilderValue<T> {
Set(T),
#[default]
NotSet,
}
impl<T: Clone> BuilderValue<T> {
pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
impl<T> BuilderValue<T> {
pub fn ok_or<E>(self, err: E) -> Result<T, E> {
match self {
Self::Set(v) => Ok(v.clone()),
Self::NotSet => match default {
BuilderValue::Set(v) => Ok(v.clone()),
BuilderValue::NotSet => {
anyhow::bail!("missing config value {field_name:?}")
}
},
Self::Set(v) => Ok(v),
Self::NotSet => Err(err),
}
}
}
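// Sketch of the fallback behavior (illustrative; mirrors the
// `ok_or(field_name, default)` variant above): an explicitly-set value wins,
// otherwise the default-values builder supplies it, and only a doubly-unset
// field produces a "missing config value" error.
fn builder_value_sketch() -> anyhow::Result<()> {
    let set: BuilderValue<u32> = BuilderValue::Set(123);
    let unset: BuilderValue<u32> = BuilderValue::NotSet;

    assert_eq!(set.ok_or("page_cache_size", BuilderValue::Set(8192))?, 123);
    assert_eq!(unset.ok_or("page_cache_size", BuilderValue::Set(8192))?, 8192);
    // NotSet with a NotSet default bails with "missing config value ...".
    assert!(unset.ok_or("page_cache_size", BuilderValue::NotSet).is_err());
    Ok(())
}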
@@ -344,7 +326,6 @@ pub(crate) struct NodeMetadata {
}
// needed to simplify config construction
#[derive(Default)]
struct PageServerConfigBuilder {
listen_pg_addr: BuilderValue<String>,
@@ -385,7 +366,6 @@ struct PageServerConfigBuilder {
cached_metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
synthetic_size_calculation_interval: BuilderValue<Duration>,
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
@@ -411,13 +391,10 @@ struct PageServerConfigBuilder {
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
validate_vectored_get: BuilderValue<bool>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
}
impl PageServerConfigBuilder {
#[inline(always)]
fn default_values() -> Self {
impl Default for PageServerConfigBuilder {
fn default() -> Self {
use self::BuilderValue::*;
use defaults::*;
Self {
@@ -470,8 +447,6 @@ impl PageServerConfigBuilder {
.expect("cannot parse default synthetic size calculation interval")),
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
metric_collection_bucket: Set(None),
disk_usage_based_eviction: Set(None),
test_remote_failures: Set(0),
@@ -499,7 +474,6 @@ impl PageServerConfigBuilder {
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
}
}
}
@@ -604,13 +578,6 @@ impl PageServerConfigBuilder {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
pub fn metric_collection_bucket(
&mut self,
metric_collection_bucket: Option<RemoteStorageConfig>,
) {
self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
}
pub fn synthetic_size_calculation_interval(
&mut self,
synthetic_size_calculation_interval: Duration,
@@ -679,103 +646,126 @@ impl PageServerConfigBuilder {
self.validate_vectored_get = BuilderValue::Set(value);
}
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
macro_rules! conf {
(USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
PageServerConf {
$(
$field: self.$field.ok_or(stringify!($field), default.$field)?,
)*
$(
$custom_field: $custom_value,
)*
}
};
}
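// For a single field, the conf! invocation below expands (roughly) to:
//   listen_pg_addr: self.listen_pg_addr.ok_or("listen_pg_addr", default.listen_pg_addr)?,
// i.e. every field listed under USING DEFAULT falls back to Self::default_values()
// before producing a "missing config value" error, while CUSTOM LOGIC fields are
// spliced in verbatim.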
Ok(conf!(
USING DEFAULT
{
listen_pg_addr,
listen_http_addr,
availability_zone,
wait_lsn_timeout,
wal_redo_timeout,
superuser,
page_cache_size,
max_file_descriptors,
workdir,
pg_distrib_dir,
http_auth_type,
pg_auth_type,
auth_validation_public_key_path,
remote_storage_config,
id,
broker_endpoint,
broker_keepalive_interval,
log_format,
metric_collection_interval,
cached_metric_collection_interval,
metric_collection_endpoint,
metric_collection_bucket,
synthetic_size_calculation_interval,
disk_usage_based_eviction,
test_remote_failures,
ondemand_download_behavior_treat_error_as_warn,
background_task_maximum_delay,
control_plane_api,
control_plane_api_token,
control_plane_emergency_mode,
heatmap_upload_concurrency,
secondary_download_concurrency,
ingest_batch_size,
get_vectored_impl,
max_vectored_read_bytes,
validate_vectored_get,
ephemeral_bytes_per_memory_kb,
}
CUSTOM LOGIC
{
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
concurrent_tenant_warmup: ConfigurableSemaphore::new({
self
.concurrent_tenant_warmup
.ok_or("concurrent_tenant_warmpup",
default.concurrent_tenant_warmup)?
}),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
self
.concurrent_tenant_size_logical_size_queries
.ok_or("concurrent_tenant_size_logical_size_queries",
default.concurrent_tenant_size_logical_size_queries.clone())?
),
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
// re-use `concurrent_tenant_size_logical_size_queries`
self
.concurrent_tenant_size_logical_size_queries
.ok_or("eviction_task_immitated_concurrent_logical_size_queries",
default.concurrent_tenant_size_logical_size_queries.clone())?,
),
virtual_file_io_engine: match self.virtual_file_io_engine {
BuilderValue::Set(v) => v,
BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
io_engine::FeatureTestResult::Worse { engine, remark } => {
// TODO: bubble this up to the caller so we can tracing::warn! it.
eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
engine
}
},
},
}
))
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?;
Ok(PageServerConf {
listen_pg_addr: self
.listen_pg_addr
.ok_or(anyhow!("missing listen_pg_addr"))?,
listen_http_addr: self
.listen_http_addr
.ok_or(anyhow!("missing listen_http_addr"))?,
availability_zone: self
.availability_zone
.ok_or(anyhow!("missing availability_zone"))?,
wait_lsn_timeout: self
.wait_lsn_timeout
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
wal_redo_timeout: self
.wal_redo_timeout
.ok_or(anyhow!("missing wal_redo_timeout"))?,
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
page_cache_size: self
.page_cache_size
.ok_or(anyhow!("missing page_cache_size"))?,
max_file_descriptors: self
.max_file_descriptors
.ok_or(anyhow!("missing max_file_descriptors"))?,
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
pg_distrib_dir: self
.pg_distrib_dir
.ok_or(anyhow!("missing pg_distrib_dir"))?,
http_auth_type: self
.http_auth_type
.ok_or(anyhow!("missing http_auth_type"))?,
pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
auth_validation_public_key_path: self
.auth_validation_public_key_path
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
remote_storage_config: self
.remote_storage_config
.ok_or(anyhow!("missing remote_storage_config"))?,
id: self.id.ok_or(anyhow!("missing id"))?,
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
broker_endpoint: self
.broker_endpoint
.ok_or(anyhow!("No broker endpoints provided"))?,
broker_keepalive_interval: self
.broker_keepalive_interval
.ok_or(anyhow!("No broker keepalive interval provided"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
concurrent_tenant_size_logical_size_queries,
),
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
concurrent_tenant_size_logical_size_queries,
),
metric_collection_interval: self
.metric_collection_interval
.ok_or(anyhow!("missing metric_collection_interval"))?,
cached_metric_collection_interval: self
.cached_metric_collection_interval
.ok_or(anyhow!("missing cached_metric_collection_interval"))?,
metric_collection_endpoint: self
.metric_collection_endpoint
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
synthetic_size_calculation_interval: self
.synthetic_size_calculation_interval
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
disk_usage_based_eviction: self
.disk_usage_based_eviction
.ok_or(anyhow!("missing disk_usage_based_eviction"))?,
test_remote_failures: self
.test_remote_failures
.ok_or(anyhow!("missing test_remote_failuers"))?,
ondemand_download_behavior_treat_error_as_warn: self
.ondemand_download_behavior_treat_error_as_warn
.ok_or(anyhow!(
"missing ondemand_download_behavior_treat_error_as_warn"
))?,
background_task_maximum_delay: self
.background_task_maximum_delay
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
control_plane_api: self
.control_plane_api
.ok_or(anyhow!("missing control_plane_api"))?,
control_plane_api_token: self
.control_plane_api_token
.ok_or(anyhow!("missing control_plane_api_token"))?,
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
secondary_download_concurrency: self
.secondary_download_concurrency
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
ingest_batch_size: self
.ingest_batch_size
.ok_or(anyhow!("missing ingest_batch_size"))?,
virtual_file_io_engine: self
.virtual_file_io_engine
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
get_vectored_impl: self
.get_vectored_impl
.ok_or(anyhow!("missing get_vectored_impl"))?,
max_vectored_read_bytes: self
.max_vectored_read_bytes
.ok_or(anyhow!("missing max_vectored_read_bytes"))?,
validate_vectored_get: self
.validate_vectored_get
.ok_or(anyhow!("missing validate_vectored_get"))?,
})
}
}
@@ -855,7 +845,18 @@ impl PageServerConf {
.join(timeline_id.to_string())
}
pub(crate) fn timeline_delete_mark_file_path(
pub fn timeline_uninit_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_shard_id, &timeline_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}
pub fn timeline_delete_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
@@ -866,10 +867,7 @@ impl PageServerConf {
)
}
pub(crate) fn tenant_deleted_mark_file_path(
&self,
tenant_shard_id: &TenantShardId,
) -> Utf8PathBuf {
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TENANT_DELETED_MARKER_FILE_NAME)
}
@@ -973,9 +971,6 @@ impl PageServerConf {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
},
"metric_collection_bucket" => {
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
}
"synthetic_size_calculation_interval" =>
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
@@ -1029,9 +1024,6 @@ impl PageServerConf {
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1094,7 +1086,6 @@ impl PageServerConf {
metric_collection_interval: Duration::from_secs(60),
cached_metric_collection_interval: Duration::from_secs(60 * 60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(60),
disk_usage_based_eviction: None,
test_remote_failures: 0,
@@ -1113,7 +1104,6 @@ impl PageServerConf {
.expect("Invalid default constant"),
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
}
}
}
@@ -1328,7 +1318,6 @@ background_task_maximum_delay = '334 s'
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: humantime::parse_duration(
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
)?,
@@ -1351,7 +1340,6 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
},
"Correct defaults should be used when no config values are provided"
);
@@ -1404,7 +1392,6 @@ background_task_maximum_delay = '334 s'
metric_collection_interval: Duration::from_secs(222),
cached_metric_collection_interval: Duration::from_secs(22200),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(333),
disk_usage_based_eviction: None,
test_remote_failures: 0,
@@ -1423,7 +1410,6 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -3,13 +3,10 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
};
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
use reqwest::Url;
use std::collections::HashMap;
use std::sync::Arc;
@@ -43,9 +40,7 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
/// Main thread that serves metrics collection
#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
tenant_manager: Arc<TenantManager>,
metric_collection_endpoint: &Url,
metric_collection_bucket: &Option<RemoteStorageConfig>,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
@@ -70,19 +65,15 @@ pub async fn collect_metrics(
None,
"synthetic size calculation",
false,
{
let tenant_manager = tenant_manager.clone();
async move {
calculate_synthetic_size_worker(
tenant_manager,
synthetic_size_calculation_interval,
&cancel,
&worker_ctx,
)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
}
async move {
calculate_synthetic_size_worker(
synthetic_size_calculation_interval,
&cancel,
&worker_ctx,
)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
},
);
@@ -103,27 +94,13 @@ pub async fn collect_metrics(
.build()
.expect("Failed to create http client with timeout");
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
match GenericRemoteStorage::from_config(bucket_config) {
Ok(client) => Some(client),
Err(e) => {
// Non-fatal error: if we were given an invalid config, we will proceed
// with sending metrics over the network, but not to S3.
tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
None
}
}
} else {
None
};
let node_id = node_id.to_string();
loop {
let started_at = Instant::now();
// these are point in time, with variable "now"
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
let metrics = Arc::new(metrics);
@@ -141,18 +118,10 @@ pub async fn collect_metrics(
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
}
}
if let Some(bucket_client) = &bucket_client {
let res =
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
if let Err(e) = res {
tracing::error!("failed to upload to S3: {e:#}");
}
}
};
let upload = async {
let res = upload::upload_metrics_http(
let res = upload::upload_metrics(
&client,
metric_collection_endpoint,
&cancel,
@@ -163,7 +132,7 @@ pub async fn collect_metrics(
.await;
if let Err(e) = res {
// serialization error which should never happen
tracing::error!("failed to upload via HTTP due to {e:#}");
tracing::error!("failed to upload due to {e:#}");
}
};
@@ -278,7 +247,6 @@ async fn reschedule(
/// Calculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
tenant_manager: Arc<TenantManager>,
synthetic_size_calculation_interval: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -291,7 +259,7 @@ async fn calculate_synthetic_size_worker(
loop {
let started_at = Instant::now();
let tenants = match tenant_manager.list_tenants() {
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(e) => {
warn!("cannot get tenant list: {e:#}");
@@ -310,14 +278,10 @@ async fn calculate_synthetic_size_worker(
continue;
}
let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
continue;
};
if !tenant.is_active() {
continue;
}
// there is never any reason to exit calculate_synthetic_size_worker following any
// return value -- we don't need to care about shutdown because no tenant is found when
// pageserver is shut down.
@@ -355,7 +319,9 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
};
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate.
// mean the synthetic size worker should terminate. we do not need any checks
// in this function because `mgr::get_tenant` will error out after shutdown has
// progressed to shutting down tenants.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))

View File

@@ -1,4 +1,3 @@
use crate::tenant::mgr::TenantManager;
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
@@ -182,7 +181,6 @@ impl MetricsKey {
}
pub(super) async fn collect_all_metrics(
tenant_manager: &Arc<TenantManager>,
cached_metrics: &Cache,
ctx: &RequestContext,
) -> Vec<RawMetric> {
@@ -190,7 +188,7 @@ pub(super) async fn collect_all_metrics(
let started_at = std::time::Instant::now();
let tenants = match tenant_manager.list_tenants() {
let tenants = match crate::tenant::mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(err) => {
tracing::error!("failed to list tenants: {:?}", err);
@@ -202,8 +200,7 @@ pub(super) async fn collect_all_metrics(
if state != TenantState::Active || !id.is_zero() {
None
} else {
tenant_manager
.get_attached_tenant_shard(id)
crate::tenant::mgr::get_tenant(id, true)
.ok()
.map(|tenant| (id.tenant_id, tenant))
}

View File

@@ -1,9 +1,4 @@
use std::time::SystemTime;
use chrono::{DateTime, Utc};
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
@@ -18,9 +13,8 @@ struct Ids {
pub(super) timeline_id: Option<TimelineId>,
}
/// Serialize and write metrics to an HTTP endpoint
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics_http(
pub(super) async fn upload_metrics(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
cancel: &CancellationToken,
@@ -80,60 +74,6 @@ pub(super) async fn upload_metrics_http(
Ok(())
}
/// Serialize and write metrics to a remote storage object
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics_bucket(
client: &GenericRemoteStorage,
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
) -> anyhow::Result<()> {
if metrics.is_empty() {
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
// of an empty object.
return Ok(());
}
// Compose object path
let datetime: DateTime<Utc> = SystemTime::now().into();
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
// Set up a gzip writer into a buffer
let mut compressed_bytes: Vec<u8> = Vec::new();
let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
// Serialize and write into compressed buffer
let started_at = std::time::Instant::now();
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
let (_chunk, body) = res?;
gzip_writer.write_all(&body).await?;
}
gzip_writer.flush().await?;
gzip_writer.shutdown().await?;
let compressed_length = compressed_bytes.len();
// Write to remote storage
client
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
compressed_length,
&path,
cancel,
)
.await?;
let elapsed = started_at.elapsed();
tracing::info!(
compressed_length,
elapsed_ms = elapsed.as_millis(),
"write metrics bucket at {path}",
);
Ok(())
}
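// Self-contained sketch of the compress-into-memory pattern used above: gzip
// bytes through a Cursor-backed buffer, remembering to flush *and* shutdown so
// the gzip trailer is written before the buffer is uploaded.
async fn gzip_to_vec(payload: &[u8]) -> anyhow::Result<Vec<u8>> {
    use tokio::io::AsyncWriteExt;
    let mut compressed: Vec<u8> = Vec::new();
    let cursor = std::io::Cursor::new(&mut compressed);
    let mut gz = async_compression::tokio::write::GzipEncoder::new(cursor);
    gz.write_all(payload).await?;
    gz.flush().await?;
    gz.shutdown().await?; // finalizes the gzip stream
    drop(gz); // release the borrow of `compressed`
    Ok(compressed)
}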
// The return type is quite ugly, but we gain testability in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,

View File

@@ -5,8 +5,7 @@ use pageserver_api::{
controller_api::NodeRegisterRequest,
shard::TenantShardId,
upcall_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
ValidateRequestTenant, ValidateResponse,
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
},
};
use serde::{de::DeserializeOwned, Serialize};
@@ -38,9 +37,7 @@ pub trait ControlPlaneGenerationsApi {
fn re_attach(
&self,
conf: &PageServerConf,
) -> impl Future<
Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
> + Send;
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
@@ -121,7 +118,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
async fn re_attach(
&self,
conf: &PageServerConf,
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -184,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
Ok(response
.tenants
.into_iter()
.map(|rart| (rart.id, rart))
.map(|t| (t.id, Generation::new(t.gen)))
.collect::<HashMap<_, _>>())
}

View File

@@ -724,8 +724,8 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
use std::{io::ErrorKind, time::Duration};
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -834,10 +834,9 @@ mod test {
async fn re_attach(
&self,
_conf: &PageServerConf,
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
unimplemented!()
}
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,

View File

@@ -61,6 +61,7 @@ use crate::{
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
@@ -813,8 +814,8 @@ async fn collect_eviction_candidates(
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
// get a snapshot of the list of tenants
let tenants = tenant_manager
.list_tenants()
let tenants = tenant::mgr::list_tenants()
.await
.context("get list of tenants")?;
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
@@ -826,12 +827,8 @@ async fn collect_eviction_candidates(
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}
let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
Ok(tenant) if tenant.is_active() => tenant,
Ok(_) => {
debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
continue;
}
let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
Ok(tenant) => tenant,
Err(e) => {
// this can happen if tenant has lifecycle transition after we fetched it
debug!("failed to get tenant: {e:#}");

View File

@@ -965,28 +965,12 @@ paths:
required: true
schema:
type: string
- name: wait_ms
description: If set, wait up to this many milliseconds for the download to complete; if it has not completed by then, return 202
in: query
required: false
schema:
type: integer
post:
description: |
If the location is in secondary mode, download the latest heatmap and layers
responses:
"200":
description: Success
content:
application/json:
schema:
$ref: "#/components/schemas/SecondaryProgress"
"202":
description: Download has started but not yet finished
content:
application/json:
schema:
$ref: "#/components/schemas/SecondaryProgress"
"500":
description: Generic operation error
content:
@@ -1038,7 +1022,7 @@ paths:
format: hex
responses:
"201":
description: Timeline was created, or already existed with matching parameters
description: TimelineInfo
content:
application/json:
schema:
@@ -1068,17 +1052,11 @@ paths:
schema:
$ref: "#/components/schemas/Error"
"409":
description: Timeline already exists, with different parameters. Creation cannot proceed.
description: Timeline already exists, creation skipped
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"429":
description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"500":
description: Generic operation error
content:
@@ -1645,37 +1623,6 @@ components:
A lower score indicates a better candidate pageserver for placing the next tenant.
The default or maximum value may be returned when a proper score cannot (yet) be calculated.
SecondaryProgress:
type: object
required:
- heatmap_mtime
- layers_downloaded
- layers_total
- bytes_downloaded
- bytes_total
properties:
heatmap_mtime:
type: string
format: date-time
description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
layers_downloaded:
type: integer
format: int64
description: How many layers from the latest layer heatmap are present on disk
bytes_downloaded:
type: integer
format: int64
description: How many bytes of layer content from the latest layer heatmap are present on disk
layers_total:
type: integer
format: int64
description: How many layers were in the latest layer heatmap
bytes_total:
type: integer
format: int64
description: How many bytes of layer content were in the latest layer heatmap
Error:
type: object
required:

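For illustration only, a client might drive the secondary-download endpoint described above roughly as sketched below; the host, shard id, and use of `serde_json::Value` for the SecondaryProgress body are placeholders, not part of the spec:

// Hypothetical polling loop: POST with wait_ms, stop once the server reports 200.
async fn poll_secondary_download(tenant_shard_id: &str) -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    let url = format!(
        "http://localhost:9898/v1/tenant/{tenant_shard_id}/secondary/download?wait_ms=5000"
    );
    loop {
        let resp = client.post(&url).send().await?;
        let status = resp.status();
        let progress: serde_json::Value = resp.json().await?; // SecondaryProgress body
        println!(
            "{status}: {}/{} layers",
            progress["layers_downloaded"], progress["layers_total"]
        );
        if status == reqwest::StatusCode::OK {
            return Ok(()); // 200 => download complete; 202 => still in progress
        }
    }
}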
View File

@@ -36,7 +36,6 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::prometheus_metrics_handler;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -49,8 +48,8 @@ use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::{
GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
TenantSlotUpsertError, TenantStateError,
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
use crate::tenant::remote_timeline_client;
@@ -249,11 +248,16 @@ impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
GetTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
// in fact exist locally. If we did, the caller could draw the conclusion
// that it can attach the tenant to another PS and we'd be in split-brain.
//
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into())
}
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
@@ -264,9 +268,6 @@ impl From<GetTenantError> for ApiError {
impl From<GetActiveTenantError> for ApiError {
fn from(e: GetActiveTenantError) -> ApiError {
match e {
GetActiveTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -277,6 +278,19 @@ impl From<GetActiveTenantError> for ApiError {
}
}
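The comment above is the key invariant: a 404 must mean the pageserver positively does not know the tenant, because the control plane treats 404 as permission to attach the tenant elsewhere. A sketch of the caller-side interpretation (the `http` crate and the function name are illustrative assumptions, not part of this change):

```rust
/// Returns true only when attaching this tenant to another pageserver
/// cannot cause split-brain, per the 404-vs-503 convention above.
fn safe_to_attach_elsewhere(status: http::StatusCode) -> bool {
    match status {
        // 404: the pageserver does not know this tenant at all.
        http::StatusCode::NOT_FOUND => true,
        // 503: the tenant exists locally but isn't active yet; retry instead.
        http::StatusCode::SERVICE_UNAVAILABLE => false,
        _ => false,
    }
}
```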
impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid).into())
}
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
}
}
impl From<crate::tenant::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
use crate::tenant::DeleteTimelineError::*;
@@ -480,7 +494,7 @@ async fn timeline_create_handler(
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -520,13 +534,10 @@ async fn timeline_create_handler(
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
)
}
Err(e @ tenant::CreateTimelineError::Conflict) => {
json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
}
Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
StatusCode::TOO_MANY_REQUESTS,
HttpErrorBody::from_msg(e.to_string()),
),
Err(
tenant::CreateTimelineError::Conflict
| tenant::CreateTimelineError::AlreadyCreating,
) => json_response(StatusCode::CONFLICT, ()),
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
StatusCode::NOT_ACCEPTABLE,
HttpErrorBody::from_msg(format!("{err:#}")),
@@ -569,7 +580,7 @@ async fn timeline_list_handler(
let response_data = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -607,7 +618,6 @@ async fn timeline_preserve_initdb_handler(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
// Part of the process for disaster recovery from safekeeper-stored WAL:
// If we don't recover into a new timeline but want to keep the timeline ID,
@@ -615,9 +625,7 @@ async fn timeline_preserve_initdb_handler(
// location where timeline recreation can find it.
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -659,7 +667,7 @@ async fn timeline_detail_handler(
let timeline_info = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -846,7 +854,7 @@ async fn timeline_delete_handler(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)
.get_attached_tenant_shard(tenant_shard_id, false)
.map_err(|e| {
match e {
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
@@ -877,16 +885,14 @@ async fn tenant_detach_handler(
let state = get_state(&request);
let conf = state.conf;
state
.tenant_manager
.detach_tenant(
conf,
tenant_shard_id,
detach_ignored.unwrap_or(false),
&state.deletion_queue_client,
)
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
.await?;
mgr::detach_tenant(
conf,
tenant_shard_id,
detach_ignored.unwrap_or(false),
&state.deletion_queue_client,
)
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
.await?;
json_response(StatusCode::OK, ())
}
@@ -964,11 +970,10 @@ async fn tenant_list_handler(
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let state = get_state(&request);
let response_data = state
.tenant_manager
.list_tenants()
let response_data = mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
.map_err(|_| {
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
})?
@@ -991,12 +996,9 @@ async fn tenant_status(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let tenant_info = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
@@ -1069,7 +1071,9 @@ async fn tenant_size_handler(
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
let headers = request.headers();
let state = get_state(&request);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
if !tenant_shard_id.is_zero() {
return Err(ApiError::BadRequest(anyhow!(
@@ -1077,12 +1081,6 @@ async fn tenant_size_handler(
)));
}
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// this can be long operation
let inputs = tenant
.gather_size_inputs(
@@ -1151,15 +1149,10 @@ async fn tenant_shard_split_handler(
let state = get_state(&request);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let new_shards = state
.tenant_manager
.shard_split(
tenant,
tenant_shard_id,
ShardCount::new(req.new_shard_count),
req.new_stripe_size,
&ctx,
@@ -1377,11 +1370,8 @@ async fn get_tenant_config_handler(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let response = HashMap::from([
(
@@ -1409,31 +1399,13 @@ async fn update_tenant_config_handler(
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
let new_tenant_conf =
let tenant_conf =
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
let state = get_state(&request);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.get_generation(),
&ShardParameters::default(),
);
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(ApiError::InternalServerError)?;
tenant.set_new_tenant_config(new_tenant_conf);
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", %tenant_id))
.await?;
json_response(StatusCode::OK, ())
}
@@ -1456,14 +1428,13 @@ async fn put_tenant_location_config_handler(
// The `Detached` state is special: it doesn't upsert a tenant, it removes
// its local disk content and drops it from memory.
if let LocationConfigMode::Detached = request_data.config.mode {
if let Err(e) = state
.tenant_manager
.detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
))
.await
if let Err(e) =
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
))
.await
{
match e {
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1657,12 +1628,10 @@ async fn handle_tenant_break(
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let state = get_state(&r);
state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?
.set_broken("broken from test".to_owned())
.await;
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await;
json_response(StatusCode::OK, ())
}
@@ -1679,7 +1648,8 @@ async fn timeline_gc_handler(
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
let wait_task_done =
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -1906,7 +1876,7 @@ async fn active_timeline_of_active_tenant(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> {
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -2017,42 +1987,13 @@ async fn secondary_download_handler(
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
state
.secondary_controller
.download_tenant(tenant_shard_id)
.await
.map_err(ApiError::InternalServerError)?;
// We don't need this to issue the download request, but:
// - it enables us to cleanly return 404 if we get a request for an absent shard
// - we will use this to provide status feedback in the response
let Some(secondary_tenant) = state
.tenant_manager
.get_secondary_tenant_shard(tenant_shard_id)
else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
));
};
let timeout = wait.unwrap_or(Duration::MAX);
let status = match tokio::time::timeout(
timeout,
state.secondary_controller.download_tenant(tenant_shard_id),
)
.await
{
// Download job ran to completion.
Ok(Ok(())) => StatusCode::OK,
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
// okay. We could get an error here in the unlikely edge case that the tenant
// was detached between our check above and executing the download job.
Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
// A timeout is not an error: we have started the download, we're just not done
// yet. The caller will get a response body indicating status.
Err(_) => StatusCode::ACCEPTED,
};
let progress = secondary_tenant.progress.lock().unwrap().clone();
json_response(status, progress)
json_response(StatusCode::OK, ())
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -2112,10 +2053,6 @@ async fn get_utilization(
r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
fail::fail_point!("get-utilization-http-handler", |_| {
Err(ApiError::ResourceUnavailable("failpoint".into()))
});
// this probably could be completely public, but lets make that change later.
check_permission(&r, None)?;
@@ -2292,7 +2229,6 @@ pub fn make_router(
Ok(router
.data(state)
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.get("/v1/status", |r| api_handler(r, status_handler))
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)

View File

@@ -2,20 +2,28 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! a neon Timeline.
//!
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use anyhow::{bail, ensure, Context, Result};
use async_compression::tokio::bufread::ZstdDecoder;
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use tokio::io::{AsyncRead, AsyncReadExt};
use nix::NixPath;
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
use tokio_tar::Archive;
use tokio_tar::Builder;
use tokio_tar::HeaderMode;
use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
@@ -625,3 +633,65 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
reader.read_to_end(&mut buf).await?;
Ok(Bytes::from(buf))
}
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tmp_path)
.await
.with_context(|| format!("tempfile creation {tmp_path}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(pgdata_path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories, so that empty directories are included
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for path in paths {
let rel_path = path.strip_prefix(pgdata_path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&path, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
}
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
pub async fn extract_tar_zst(
pgdata_path: &Utf8Path,
tar_zst: impl AsyncBufRead + Unpin,
) -> Result<()> {
let tar = Box::pin(ZstdDecoder::new(tar_zst));
let mut archive = Archive::new(tar);
archive.unpack(pgdata_path).await?;
Ok(())
}
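For illustration, a round trip through the two helpers above; the paths are hypothetical, and note that `create_tar_zst` hands back the file already seeked to the start, so it can be re-read immediately:

```rust
use camino::Utf8Path;

async fn roundtrip() -> anyhow::Result<()> {
    let pgdata = Utf8Path::new("/tmp/pgdata");
    let tmp = Utf8Path::new("/tmp/initdb.tar.zst___temp");

    let (file, len) = create_tar_zst(pgdata, tmp).await?;
    println!("compressed archive is {len} bytes");

    // BufReader satisfies the `AsyncBufRead + Unpin` bound on extract_tar_zst.
    let reader = tokio::io::BufReader::new(file);
    extract_tar_zst(Utf8Path::new("/tmp/pgdata-restored"), reader).await?;
    Ok(())
}
```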

View File

@@ -31,7 +31,6 @@ pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::mgr::TenantManager;
use tracing::info;
/// Current storage format version
@@ -54,11 +53,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(
tenant_manager: &TenantManager,
deletion_queue: Option<DeletionQueue>,
exit_code: i32,
) {
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -72,7 +67,7 @@ pub async fn shutdown_pageserver(
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
timed(
tenant_manager.shutdown(),
tenant::mgr::shutdown_all_tenants(),
"shutdown all tenants",
Duration::from_secs(5),
)
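`timed` wraps the shutdown future with a name and a duration used for reporting. A hypothetical implementation of that shape, purely to illustrate the call's semantics (the real helper lives elsewhere in this repo):

```rust
use std::time::{Duration, Instant};

// Drive a future to completion, warning if it ran longer than `warn_after`.
async fn timed<F: std::future::Future>(fut: F, name: &str, warn_after: Duration) -> F::Output {
    let started = Instant::now();
    let output = fut.await;
    let elapsed = started.elapsed();
    if elapsed > warn_after {
        tracing::warn!("{name} took {elapsed:?} (threshold {warn_after:?})");
    }
    output
}
```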
@@ -119,27 +114,27 @@ pub const METADATA_FILE_NAME: &str = "metadata";
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
pub const TENANT_CONFIG_NAME: &str = "config";
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
/// Per-tenant copy of their remote heatmap, downloaded into the local
/// tenant path while in secondary mode.
pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
/// A suffix used for various temporary files. Any temporary files found in the
/// data directory at pageserver startup can be automatically removed.
pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
pub const TEMP_FILE_SUFFIX: &str = "___temp";
/// A marker file to mark that a timeline directory was not fully initialized.
/// If a timeline directory with this marker is encountered at pageserver startup,
/// the timeline directory and the marker file are both removed.
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
/// A marker file to prevent pageserver from loading a certain tenant on restart.
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
@@ -166,11 +161,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
// from the name.
pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}
pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
pub fn is_delete_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}

View File

@@ -435,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(||
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
"The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
// Corollary: If any files are missing from the index part, they won't be included here.
&["tenant_id", "shard_id", "timeline_id"]
)
@@ -699,14 +699,6 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register pageserver_startup_is_loading")
});
pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_timeline_ephemeral_bytes",
"Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
)
.expect("Failed to register metric")
});
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// like how long it took to load.
///
@@ -2473,8 +2465,7 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
}
pub mod tokio_epoll_uring {
use metrics::{register_int_counter, UIntGauge};
use once_cell::sync::Lazy;
use metrics::UIntGauge;
pub struct Collector {
descs: Vec<metrics::core::Desc>,
@@ -2482,13 +2473,15 @@ pub mod tokio_epoll_uring {
systems_destroyed: UIntGauge,
}
const NMETRICS: usize = 2;
impl metrics::core::Collector for Collector {
fn desc(&self) -> Vec<&metrics::core::Desc> {
self.descs.iter().collect()
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
let mut mfs = Vec::with_capacity(Self::NMETRICS);
let mut mfs = Vec::with_capacity(NMETRICS);
let tokio_epoll_uring::metrics::Metrics {
systems_created,
systems_destroyed,
@@ -2502,8 +2495,6 @@ pub mod tokio_epoll_uring {
}
impl Collector {
const NMETRICS: usize = 2;
#[allow(clippy::new_without_default)]
pub fn new() -> Self {
let mut descs = Vec::new();
@@ -2537,22 +2528,6 @@ pub mod tokio_epoll_uring {
}
}
}
pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
"Number of times where thread_local_system creation spanned multiple executor threads",
)
.unwrap()
});
pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
"Number of times thread_local_system creation failed and was retried after back-off.",
)
.unwrap()
});
}
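For context, a hand-rolled `Collector` like the one above gets wired into a registry roughly like this; a sketch assuming a prometheus-style `Registry` (not this repo's exact wiring):

```rust
fn register_tokio_epoll_uring_metrics(registry: &prometheus::Registry) -> prometheus::Result<()> {
    // The registry calls desc() once for validation, then collect() on every scrape.
    registry.register(Box::new(Collector::new()))
}
```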
pub(crate) mod tenant_throttling {
@@ -2681,8 +2656,6 @@ pub fn preinitialize_metrics() {
&WALRECEIVER_BROKER_UPDATES,
&WALRECEIVER_CANDIDATES_ADDED,
&WALRECEIVER_CANDIDATES_REMOVED,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
]
.into_iter()
.for_each(|c| {

View File

@@ -760,7 +760,6 @@ impl PageServerHandler {
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
timeline
.import_basebackup_from_tar(
tenant.clone(),
&mut copyin_reader,
base_lsn,
self.broker_client.clone(),

View File

@@ -34,7 +34,6 @@ use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1547,13 +1546,12 @@ impl<'a> DatadirModification<'a> {
if !self.pending_updates.is_empty() {
// The put_batch call below expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
self.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
VecMapOrdering::GreaterOrEqual,
);
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
.collect();
writer.put_batch(lsn_ordered_batch, ctx).await?;
}
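The k-way merge above relies on each per-key `Vec` already being LSN-ordered, so merging the per-key streams yields one globally LSN-sorted batch. A self-contained sketch of the same idea using `itertools`, with simplified types (`u64` for `Lsn`, `&str` for `Key`/`Value`):

```rust
use itertools::Itertools;
use std::collections::HashMap;

fn main() {
    let mut pending: HashMap<&str, Vec<(u64, &str)>> = HashMap::new();
    pending.insert("key_a", vec![(1, "a@1"), (3, "a@3")]);
    pending.insert("key_b", vec![(2, "b@2")]);

    // Each per-key Vec is already sorted by LSN; kmerge_by merges them into
    // one globally LSN-ordered stream, which is what put_batch expects.
    let ordered: Vec<(&str, u64, &str)> = pending
        .drain()
        .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
        .kmerge_by(|lhs, rhs| lhs.1 < rhs.1)
        .collect();

    assert_eq!(ordered.iter().map(|t| t.1).collect::<Vec<_>>(), vec![1, 2, 3]);
}
```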

View File

@@ -50,6 +50,8 @@ use once_cell::sync::Lazy;
use utils::id::TimelineId;
use crate::shutdown_pageserver;
//
// There are four runtimes:
//
@@ -451,7 +453,7 @@ async fn task_finish(
}
if shutdown_process {
std::process::exit(1);
shutdown_pageserver(None, 1).await;
}
}

View File

@@ -43,8 +43,6 @@ use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
use utils::timeout::timeout_cancellable;
use utils::timeout::TimeoutCancellableError;
use utils::zstd::create_zst_tarball;
use utils::zstd::extract_zst_tarball;
use self::config::AttachedLocationConfig;
use self::config::AttachmentMode;
@@ -57,8 +55,8 @@ use self::mgr::GetTenantError;
use self::mgr::TenantsMap;
use self::remote_timeline_client::upload::upload_index_part;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineCreateGuard;
use self::timeline::uninit::TimelineExclusionError;
use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
@@ -202,13 +200,6 @@ pub(super) struct AttachedTenantConf {
}
impl AttachedTenantConf {
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
Self {
tenant_conf,
location,
}
}
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
match &location_conf.mode {
LocationMode::Attached(attach_conf) => Ok(Self {
@@ -574,8 +565,9 @@ impl Tenant {
// avoiding holding it across awaits
let mut timelines_accessor = self.timelines.lock().unwrap();
match timelines_accessor.entry(timeline_id) {
// We should never try and load the same timeline twice during startup
Entry::Occupied(_) => {
// The uninit mark file acts as a lock that prevents another task from
// initializing the timeline at the same time.
unreachable!(
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
);
@@ -685,20 +677,9 @@ impl Tenant {
}
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
enum BrokenVerbosity {
Error,
Info
}
let make_broken =
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
match verbosity {
BrokenVerbosity::Info => {
info!("attach cancelled, setting tenant state to Broken: {err}");
},
BrokenVerbosity::Error => {
error!("attach failed, setting tenant state to Broken: {err:?}");
}
}
|t: &Tenant, err: anyhow::Error| {
error!("attach failed, setting tenant state to Broken: {err:?}");
t.state.send_modify(|state| {
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
// if it errors, we will call make_broken when tenant is already in Stopping.
@@ -762,7 +743,7 @@ impl Tenant {
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
// just shutting down), but ensures progress.
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
return Ok(());
},
)
@@ -784,7 +765,7 @@ impl Tenant {
match res {
Ok(p) => Some(p),
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
make_broken(&tenant_clone, anyhow::anyhow!(e));
return Ok(());
}
}
@@ -808,7 +789,7 @@ impl Tenant {
{
Ok(should_resume_deletion) => should_resume_deletion,
Err(err) => {
make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
make_broken(&tenant_clone, anyhow::anyhow!(err));
return Ok(());
}
}
@@ -838,7 +819,7 @@ impl Tenant {
.await;
if let Err(e) = deleted {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
return Ok(());
@@ -859,7 +840,7 @@ impl Tenant {
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
}
@@ -1083,7 +1064,8 @@ impl Tenant {
let entry_path = entry.path();
let purge = if crate::is_temporary(entry_path)
// TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
// TODO: uninit_mark isn't needed any more, since uninitialized timelines are already
// covered by the check that the timeline must exist in remote storage.
|| is_uninit_mark(entry_path)
|| crate::is_delete_mark(entry_path)
{
@@ -1316,6 +1298,11 @@ impl Tenant {
/// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
/// and the timeline will fail to load at a restart.
///
/// That's why we add an uninit mark file, and wrap it together with the Timeline
/// in-memory object into UninitializedTimeline.
/// Once the caller is done setting up the timeline, they should call
/// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark.
///
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
/// minimum amount of keys required to get a writable timeline.
/// (Without it, `put` might fail due to `repartition` failing.)
@@ -1331,9 +1318,7 @@ impl Tenant {
"Cannot create empty timelines on inactive tenant"
);
// Protect against concurrent attempts to use this TimelineId
let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0. The caller must import some data to
// make it valid before calling finish_creation()
@@ -1348,7 +1333,7 @@ impl Tenant {
self.prepare_new_timeline(
new_timeline_id,
&new_metadata,
create_guard,
timeline_uninit_mark,
initdb_lsn,
None,
)
@@ -1411,7 +1396,7 @@ impl Tenant {
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn create_timeline(
self: &Arc<Tenant>,
&self,
new_timeline_id: TimelineId,
ancestor_timeline_id: Option<TimelineId>,
mut ancestor_start_lsn: Option<Lsn>,
@@ -1436,8 +1421,9 @@ impl Tenant {
.map_err(|_| CreateTimelineError::ShuttingDown)?;
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
// and that no other creation attempts will be allowed in while we are working.
let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
// and that no other creation attempts will be allowed in while we are working. The
// uninit_mark is a guard.
let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
Ok(m) => m,
Err(TimelineExclusionError::AlreadyCreating) => {
// Creation is in progress, we cannot create it again, and we cannot
@@ -1480,8 +1466,6 @@ impl Tenant {
}
};
pausable_failpoint!("timeline-creation-after-uninit");
let loaded_timeline = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = self
@@ -1529,7 +1513,7 @@ impl Tenant {
&ancestor_timeline,
new_timeline_id,
ancestor_start_lsn,
create_guard,
uninit_mark,
ctx,
)
.await?
@@ -1539,7 +1523,7 @@ impl Tenant {
new_timeline_id,
pg_version,
load_existing_initdb,
create_guard,
uninit_mark,
ctx,
)
.await?
@@ -1559,7 +1543,7 @@ impl Tenant {
})?;
}
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
loaded_timeline.activate(broker_client, None, ctx);
Ok(loaded_timeline)
}
@@ -1731,12 +1715,7 @@ impl Tenant {
let mut activated_timelines = 0;
for timeline in timelines_to_activate {
timeline.activate(
self.clone(),
broker_client.clone(),
background_jobs_can_start,
ctx,
);
timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
activated_timelines += 1;
}
@@ -2068,12 +2047,7 @@ impl Tenant {
TenantState::Active { .. } => {
return Ok(());
}
TenantState::Broken { reason, .. } => {
// This is fatal, and reported distinctly from the general case of "will never be active" because
// it's logically a 500 to external API users (broken is always a bug).
return Err(GetActiveTenantError::Broken(reason));
}
TenantState::Stopping { .. } => {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// There's no chance the tenant can transition back into ::Active
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
}
@@ -2151,7 +2125,7 @@ impl Tenant {
// Shut down the timeline's remote client: this means that the indices we write
// for child shards will not be invalidated by the parent shard deleting layers.
tl_client.shutdown().await;
tl_client.shutdown().await?;
// Download methods can still be used after shutdown, as they don't flow through the remote client's
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
@@ -2896,9 +2870,9 @@ impl Tenant {
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
let tl = self
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
.branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
.await?;
tl.set_state(TimelineState::Active);
Ok(tl)
@@ -2912,10 +2886,10 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
timeline_create_guard: TimelineCreateGuard<'_>,
timeline_uninit_mark: TimelineUninitMark<'_>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
.await
}
@@ -2924,7 +2898,7 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
timeline_create_guard: TimelineCreateGuard<'_>,
timeline_uninit_mark: TimelineUninitMark<'_>,
_ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let src_id = src_timeline.timeline_id;
@@ -3008,7 +2982,7 @@ impl Tenant {
.prepare_new_timeline(
dst_id,
&metadata,
timeline_create_guard,
timeline_uninit_mark,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
)
@@ -3040,12 +3014,12 @@ impl Tenant {
load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
self.bootstrap_timeline(
timeline_id,
pg_version,
load_existing_initdb,
create_guard,
uninit_mark,
ctx,
)
.await
@@ -3072,13 +3046,8 @@ impl Tenant {
}
}
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
warn!(
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
);
}
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
pausable_failpoint!("before-initdb-upload");
@@ -3114,7 +3083,7 @@ impl Tenant {
timeline_id: TimelineId,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
timeline_create_guard: TimelineCreateGuard<'_>,
timeline_uninit_mark: TimelineUninitMark<'_>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
@@ -3126,14 +3095,13 @@ impl Tenant {
TEMP_FILE_SUFFIX,
);
// Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
// we won't race with other creations or existent timelines with the same path.
// an uninit mark was placed before, so nothing else can access this timeline's files
// the current initdb has not run yet, so remove whatever was left from the previous runs
if pgdata_path.exists() {
fs::remove_dir_all(&pgdata_path).with_context(|| {
format!("Failed to remove already existing initdb directory: {pgdata_path}")
})?;
}
// this new directory is only temporary: we remove it immediately after bootstrap, since we don't need it
scopeguard::defer! {
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
@@ -3178,7 +3146,7 @@ impl Tenant {
let buf_read =
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
extract_zst_tarball(&pgdata_path, buf_read)
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
.await
.context("extract initdb tar")?;
} else {
@@ -3210,7 +3178,7 @@ impl Tenant {
.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_create_guard,
timeline_uninit_mark,
pgdata_lsn,
None,
)
@@ -3282,12 +3250,13 @@ impl Tenant {
///
/// An empty layer map is initialized, and new data and WAL can be imported starting
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map.
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
/// uninit mark file.
async fn prepare_new_timeline<'a>(
&'a self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
create_guard: TimelineCreateGuard<'a>,
uninit_mark: TimelineUninitMark<'a>,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<UninitializedTimeline> {
@@ -3310,12 +3279,9 @@ impl Tenant {
timeline_struct.init_empty_layer_map(start_lsn);
if let Err(e) = self
.create_timeline_files(&create_guard.timeline_path)
.await
{
if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(create_guard);
cleanup_timeline_directory(uninit_mark);
return Err(e);
}
@@ -3326,31 +3292,41 @@ impl Tenant {
Ok(UninitializedTimeline::new(
self,
new_timeline_id,
Some((timeline_struct, create_guard)),
Some((timeline_struct, uninit_mark)),
))
}
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
fail::fail_point!("after-timeline-dir-creation", |_| {
anyhow::bail!("failpoint after-timeline-dir-creation");
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});
Ok(())
}
/// Get a guard that provides exclusive access to the timeline directory, preventing
/// concurrent attempts to create the same timeline.
fn create_timeline_create_guard(
/// Attempts to create an uninit mark file for the timeline initialization.
/// Bails if the timeline is already loaded into memory (i.e. initialized before) or if the uninit mark file already exists.
///
/// This way, we need to hold the timelines lock only for a small amount of time during the mark check/creation per timeline init.
fn create_timeline_uninit_mark(
&self,
timeline_id: TimelineId,
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
) -> Result<TimelineUninitMark, TimelineExclusionError> {
let tenant_shard_id = self.tenant_shard_id;
let uninit_mark_path = self
.conf
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
let uninit_mark = TimelineUninitMark::new(
self,
timeline_id,
uninit_mark_path.clone(),
timeline_path.clone(),
)?;
// At this stage, we have got exclusive access to in-memory state for this timeline ID
// for creation.
@@ -3366,7 +3342,23 @@ impl Tenant {
)));
}
Ok(create_guard)
// Create the on-disk uninit mark _after_ the in-memory acquisition of the timeline ID: this guarantees
// that during process runtime, colliding creations will be caught in-memory without getting
// as far as failing to write a file.
fs::OpenOptions::new()
.write(true)
.create_new(true)
.open(&uninit_mark_path)
.context("Failed to create uninit mark file")
.and_then(|_| {
crashsafe::fsync_file_and_parent(&uninit_mark_path)
.context("Failed to fsync uninit mark file")
})
.with_context(|| {
format!("Failed to create uninit mark for timeline {tenant_shard_id}/{timeline_id}")
})?;
Ok(uninit_mark)
}
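The create-then-fsync sequence above is a general crash-safe marker pattern: `create_new` gives exclusivity (the open fails if the file exists), and fsyncing both the file and its parent directory makes the new directory entry durable. A standalone, std-only sketch with illustrative paths:

```rust
use std::fs::{File, OpenOptions};
use std::io;
use std::path::Path;

fn create_marker(path: &Path) -> io::Result<()> {
    // Fails with AlreadyExists if another task won the race.
    OpenOptions::new().write(true).create_new(true).open(path)?;
    // Durability: fsync the file itself...
    File::open(path)?.sync_all()?;
    // ...and its parent directory (Unix: directories can be opened and fsynced),
    // so the new directory entry survives a crash.
    if let Some(parent) = path.parent() {
        File::open(parent)?.sync_all()?;
    }
    Ok(())
}

fn main() -> io::Result<()> {
    create_marker(Path::new("/tmp/timeline-abc___uninit"))
}
```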
/// Gathers inputs from all of the timelines to produce a sizing model input.
@@ -5107,15 +5099,15 @@ mod tests {
}
#[tokio::test]
async fn test_create_guard_crash() -> anyhow::Result<()> {
let name = "test_create_guard_crash";
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
let name = "test_uninit_mark_crash";
let harness = TenantHarness::create(name)?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
// Keeps uninit mark in place
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown()
@@ -5143,6 +5135,11 @@ mod tests {
.timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
.exists());
assert!(!harness
.conf
.timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID)
.exists());
Ok(())
}

View File

@@ -196,17 +196,16 @@ impl LocationConf {
/// For use when attaching/re-attaching: update the generation stored in this
/// structure. If we were in a secondary state, promote to attached (possession
/// of a fresh generation implies this).
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
match &mut self.mode {
LocationMode::Attached(attach_conf) => {
attach_conf.generation = generation;
attach_conf.attach_mode = mode;
}
LocationMode::Secondary(_) => {
// We are promoted to attached by the control plane's re-attach response
self.mode = LocationMode::Attached(AttachedLocationConfig {
generation,
attach_mode: mode,
attach_mode: AttachmentMode::Single,
})
}
}

View File

@@ -111,7 +111,6 @@ async fn create_local_delete_mark(
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
@@ -297,7 +296,6 @@ impl DeleteTenantFlow {
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
@@ -305,9 +303,7 @@ impl DeleteTenantFlow {
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) =
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
{
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
@@ -326,7 +322,6 @@ impl DeleteTenantFlow {
conf: &'static PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
guard.mark_in_progress()?;
@@ -340,9 +335,15 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -545,7 +546,8 @@ impl DeleteTenantFlow {
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
&task_mgr::shutdown_token(),
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await?;

View File

@@ -2,13 +2,13 @@
//! page server.
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::stream::StreamExt;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::{
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
};
use pageserver_api::upcall_api::ReAttachResponseTenant;
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::cmp::Ordering;
@@ -16,7 +16,6 @@ use std::collections::{BTreeMap, HashMap};
use std::ops::Deref;
use std::sync::Arc;
use std::time::{Duration, Instant};
use sysinfo::SystemExt;
use tokio::fs;
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
@@ -40,10 +39,10 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
TenantConfOpt,
};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -103,7 +102,7 @@ pub(crate) enum TenantsMap {
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
/// New tenants can be added using [`tenant_map_acquire_slot`].
Open(BTreeMap<TenantShardId, TenantSlot>),
/// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
/// Existing tenants are still accessible, but no new tenants can be created.
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
}
@@ -126,46 +125,6 @@ pub(crate) enum ShardSelector {
Page(Key),
}
/// A convenience for use with the re_attach ControlPlaneClient function: rather
/// than the serializable struct, we build this enum that encapsulates
/// the invariant that attached tenants always have generations.
///
/// This represents the subset of a LocationConfig that we receive during re-attach.
pub(crate) enum TenantStartupMode {
Attached((AttachmentMode, Generation)),
Secondary,
}
impl TenantStartupMode {
/// Return the generation & mode that should be used when starting
/// this tenant.
///
/// If this returns None, the re-attach struct is in an invalid state and
/// should be ignored in the response.
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
match (rart.mode, rart.gen) {
(LocationConfigMode::Detached, _) => None,
(LocationConfigMode::Secondary, _) => Some(Self::Secondary),
(LocationConfigMode::AttachedMulti, Some(g)) => {
Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
}
(LocationConfigMode::AttachedSingle, Some(g)) => {
Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
}
(LocationConfigMode::AttachedStale, Some(g)) => {
Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
}
_ => {
tracing::warn!(
"Received invalid re-attach state for tenant {}: {rart:?}",
rart.id
);
None
}
}
}
}
impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
@@ -302,17 +261,11 @@ pub struct TenantManager {
// See https://github.com/neondatabase/neon/issues/5796
tenants: &'static std::sync::RwLock<TenantsMap>,
resources: TenantSharedResources,
// Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
// This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
// when the tenant detaches.
cancel: CancellationToken,
}
fn emergency_generations(
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantShardId, TenantStartupMode> {
) -> HashMap<TenantShardId, Generation> {
tenant_confs
.iter()
.filter_map(|(tid, lc)| {
@@ -320,15 +273,12 @@ fn emergency_generations(
Ok(lc) => lc,
Err(_) => return None,
};
Some((
*tid,
match &lc.mode {
LocationMode::Attached(alc) => {
TenantStartupMode::Attached((alc.attach_mode, alc.generation))
}
LocationMode::Secondary(_) => TenantStartupMode::Secondary,
},
))
let gen = match &lc.mode {
LocationMode::Attached(alc) => Some(alc.generation),
LocationMode::Secondary(_) => None,
};
gen.map(|g| (*tid, g))
})
.collect()
}
@@ -338,7 +288,7 @@ async fn init_load_generations(
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
resources: &TenantSharedResources,
cancel: &CancellationToken,
) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
let generations = if conf.control_plane_emergency_mode {
error!(
"Emergency mode! Tenants will be attached unsafely using their last known generation"
@@ -348,12 +298,7 @@ async fn init_load_generations(
info!("Calling control plane API to re-attach tenants");
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
match client.re_attach(conf).await {
Ok(tenants) => tenants
.into_iter()
.flat_map(|(id, rart)| {
TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
})
.collect(),
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
anyhow::bail!("Shut down while waiting for control plane re-attach response")
}
@@ -371,17 +316,9 @@ async fn init_load_generations(
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
resources
.deletion_queue_client
.recover(generations.clone())?;
}
Ok(Some(generations))
@@ -544,23 +481,12 @@ pub async fn init_tenant_mgr(
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
// Initialize dynamic limits that depend on system resources
let system_memory =
sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
.total_memory();
let max_ephemeral_layer_bytes =
conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024);
tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory");
inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store(
max_ephemeral_layer_bytes,
std::sync::atomic::Ordering::Relaxed,
);
// Scan local filesystem for attached tenants
let tenant_configs = init_load_tenant_configs(conf).await?;
// Determine which tenants are to be secondary or attached, and in which generation
let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
// Determine which tenants are to be attached
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
tracing::info!(
"Attaching {} tenants at startup, warming up {} at a time",
@@ -590,102 +516,97 @@ pub async fn init_tenant_mgr(
}
};
// FIXME: if we were attached, and get demoted to secondary on re-attach, we
// don't have a place to get a config.
// (https://github.com/neondatabase/neon/issues/5377)
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
SecondaryLocationConfig { warm: true };
// Update the location config according to the re-attach response
if let Some(tenant_modes) = &tenant_modes {
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
match tenant_modes.get(&tenant_shard_id) {
None => {
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
// We deleted local content: move on to next tenant, don't try and spawn this one.
continue;
}
Some(TenantStartupMode::Secondary) => {
if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
}
}
Some(TenantStartupMode::Attached((attach_mode, generation))) => {
let old_gen_higher = match &location_conf.mode {
LocationMode::Attached(AttachedLocationConfig {
generation: old_generation,
attach_mode: _attach_mode,
}) => {
if old_generation > generation {
Some(old_generation)
} else {
None
}
}
_ => None,
};
if let Some(old_generation) = old_gen_higher {
if let Some(gen) = generations.get(&tenant_shard_id) {
if let LocationMode::Attached(attached) = &location_conf.mode {
if attached.generation > *gen {
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
old_generation
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
attached.generation
);
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
// local disk content: demote to secondary rather than detaching.
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
} else {
location_conf.attach_in_generation(*attach_mode, *generation);
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf.clone(),
&SecondaryLocationConfig { warm: false },
)),
);
}
}
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(secondary_config) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf,
secondary_config,
)),
);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
// instruct us about secondary attachments. That way, instead of throwing
// away local state, we can gracefully fall back to secondary here, if the control
// plane tells us so.
// (https://github.com/neondatabase/neon/issues/5377)
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
}
};
continue;
}
} else {
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
Generation::none()
};
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
let shard_identity = location_conf.shard;
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => {
match tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
&ctx,
) {
Ok(tenant) => TenantSlot::Attached(tenant),
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
continue;
}
}
match tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
}
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
location_conf.tenant_conf,
&secondary_conf,
)),
};
tenants.insert(tenant_shard_id, slot);
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
}
}
}
info!("Processed {} local tenants at startup", tenants.len());
@@ -699,14 +620,13 @@ pub async fn init_tenant_mgr(
conf,
tenants: &TENANTS,
resources,
cancel: CancellationToken::new(),
})
}
/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
/// a broken tenant in the map if Tenant::spawn fails.
#[allow(clippy::too_many_arguments)]
fn tenant_spawn(
pub(crate) fn tenant_spawn(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_path: &Utf8Path,
@@ -760,6 +680,21 @@ fn tenant_spawn(
Ok(tenant)
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() would then return `None`.
/// That could easily be misinterpreted by the control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown_all_tenants() {
shutdown_all_tenants0(&TENANTS).await
}
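As an illustration of the NB above, a minimal standalone sketch (invented names, not this codebase's API) of why keeping shut-down tenants in the map matters: a lookup can distinguish "present but stopping" from "absent", so the management API can answer with a retryable status instead of 404.
use std::collections::HashMap;
enum Slot {
    Attached,
    ShuttingDown,
}
fn status_code(map: &HashMap<u64, Slot>, id: u64) -> u16 {
    match map.get(&id) {
        // Still in the map while stopping: tell the caller to retry, so the
        // control plane does not re-attach the tenant elsewhere (split-brain).
        Some(Slot::ShuttingDown) => 503,
        Some(Slot::Attached) => 200,
        // Genuinely unknown: a 404 here is safe to act on.
        None => 404,
    }
}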
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
let mut join_set = JoinSet::new();
@@ -888,6 +823,50 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
// caller will log how long we took
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum SetNewTenantConfigError {
#[error(transparent)]
GetTenant(#[from] GetTenantError),
#[error(transparent)]
Persist(anyhow::Error),
#[error(transparent)]
Other(anyhow::Error),
}
pub(crate) async fn set_new_tenant_config(
conf: &'static PageServerConf,
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), SetNewTenantConfigError> {
// Legacy API: does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
)));
}
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
tenant.generation,
&ShardParameters::default(),
);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum UpsertLocationError {
#[error("Bad config request: {0}")]
@@ -913,21 +892,32 @@ impl TenantManager {
self.conf
}
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
/// undergoing a state change (i.e. slot is InProgress).
///
/// The returned Tenant is not guaranteed to be active: check its status after obtaining it, or
/// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or does not match the query.
/// `active_only = true` restricts the query to tenants that are ready for operations, erroring on any other kind of tenant.
pub(crate) fn get_attached_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
match peek_slot {
Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
@@ -1420,8 +1410,7 @@ impl TenantManager {
.wait_to_become_active(activation_timeout)
.await
.map_err(|e| match e {
GetActiveTenantError::WillNotBecomeActive(_)
| GetActiveTenantError::Broken(_) => {
GetActiveTenantError::WillNotBecomeActive(_) => {
DeleteTenantError::InvalidState(tenant.current_state())
}
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
@@ -1439,7 +1428,6 @@ impl TenantManager {
self.resources.remote_storage.clone(),
&TENANTS,
tenant,
&self.cancel,
)
.await;
@@ -1448,30 +1436,29 @@ impl TenantManager {
result
}
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
pub(crate) async fn shard_split(
&self,
tenant: Arc<Tenant>,
tenant_shard_id: TenantShardId,
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
let tenant_shard_id = *tenant.get_tenant_shard_id();
let r = self
.do_shard_split(tenant, new_shard_count, new_stripe_size, ctx)
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
.await;
if r.is_err() {
// Shard splitting might have left the original shard in a partially shut down state (it
// stops the shard's remote timeline client). Reset it to ensure we leave things in
// a working state.
if self.get(tenant_shard_id).is_some() {
tracing::warn!("Resetting after shard split failure");
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
// Log this error because our return value will still be the original error, not this one. This is
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
// setting it broken probably won't help either.
tracing::error!("Failed to reset: {e}");
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
}
}
}
@@ -1481,12 +1468,12 @@ impl TenantManager {
pub(crate) async fn do_shard_split(
&self,
tenant: Arc<Tenant>,
tenant_shard_id: TenantShardId,
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
let tenant_shard_id = *tenant.get_tenant_shard_id();
let tenant = get_tenant(tenant_shard_id, true)?;
// Validate the incoming request
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
@@ -1532,6 +1519,7 @@ impl TenantManager {
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
// have been left in a partially-shut-down state.
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
self.reset_tenant(tenant_shard_id, false, ctx).await?;
return Err(e);
}
@@ -1680,7 +1668,19 @@ impl TenantManager {
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
self.spawn_background_purge(tmp_path);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
None,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
"failpoint"
@@ -1715,9 +1715,9 @@ impl TenantManager {
.layers
.read()
.await
.likely_resident_layers()
.collect::<Vec<_>>();
.resident_layers()
.collect::<Vec<_>>()
.await;
for layer in timeline_layers {
let relative_path = layer
.local_path()
@@ -1817,136 +1817,6 @@ impl TenantManager {
Ok(())
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() would then return `None`.
/// That could easily be misinterpreted by the control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
shutdown_all_tenants0(self.tenants).await
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
}
pub(crate) async fn detach_tenant(
&self,
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<(), TenantStateError> {
let tmp_path = self
.detach_tenant0(
conf,
&TENANTS,
tenant_shard_id,
detach_ignored,
deletion_queue_client,
)
.await?;
self.spawn_background_purge(tmp_path);
Ok(())
}
async fn detach_tenant0(
&self,
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<Utf8PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} rename")
})
};
let removal_result = remove_tenant_from_memory(
tenants,
tenant_shard_id,
tenant_dir_rename_operation(tenant_shard_id),
)
.await;
// Flush pending deletions, so that they have a good chance of passing validation
// before this tenant is potentially re-attached elsewhere.
deletion_queue_client.flush_advisory();
// Ignored tenants are not present in memory, so the removal-from-memory operation above will bail.
// Before returning the error, check for the ignored-tenant case: then we only need to clean up its local files.
if detach_ignored
&& matches!(
removal_result,
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
)
{
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
.await
.with_context(|| {
format!("Ignored tenant {tenant_shard_id} local directory rename")
})?;
return Ok(tmp_path);
}
}
removal_result
}
pub(crate) fn list_tenants(
&self,
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})
.collect())
}
}
#[derive(Debug, thiserror::Error)]
@@ -1958,12 +1828,51 @@ pub(crate) enum GetTenantError {
#[error("Tenant {0} is not active")]
NotActive(TenantShardId),
/// Broken is logically a subset of NotActive, but a distinct error is useful as
/// NotActive is usually a retryable state for API purposes, whereas Broken
/// is a stuck error state
#[error("Tenant is broken: {0}")]
Broken(String),
// Initializing or shutting down: cannot authoritatively say whether we have this tenant
#[error("Tenant map is not available: {0}")]
MapState(#[from] TenantMapError),
}
/// Gets the tenant from the in-memory data, erroring if it's absent or does not match the query.
/// `active_only = true` restricts the query to tenants that are ready for operations, erroring on any other kind of tenant.
///
/// This method is cancel-safe.
pub(crate) fn get_tenant(
tenant_shard_id: TenantShardId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
let locked = TENANTS.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
match peek_slot {
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
TenantState::Broken {
reason,
backtrace: _,
} if active_only => Err(GetTenantError::Broken(reason)),
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
}
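A usage sketch for the `active_only` flag documented above; the call site is invented for illustration and is not part of this diff.
// Illustrative caller: with active_only = true, Broken tenants surface as
// GetTenantError::Broken and other non-Active states as NotActive; with
// active_only = false, any attached tenant is returned regardless of state.
fn example_lookup(tenant_shard_id: TenantShardId) -> Result<(), GetTenantError> {
    let tenant = get_tenant(tenant_shard_id, /* active_only */ true)?;
    debug_assert!(tenant.is_active());
    Ok(())
}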
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetActiveTenantError {
/// We may time out either while TenantSlot is InProgress, or while the Tenant
@@ -1987,12 +1896,6 @@ pub(crate) enum GetActiveTenantError {
/// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken)
#[error("will not become active. Current state: {0}")]
WillNotBecomeActive(TenantState),
/// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as
/// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should
/// never happen.
#[error("Tenant is broken: {0}")]
Broken(String),
}
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
@@ -2115,6 +2018,87 @@ pub(crate) enum TenantStateError {
Other(#[from] anyhow::Error),
}
pub(crate) async fn detach_tenant(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<(), TenantStateError> {
let tmp_path = detach_tenant0(
conf,
&TENANTS,
tenant_shard_id,
detach_ignored,
deletion_queue_client,
)
.await?;
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
Ok(())
}
async fn detach_tenant0(
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<Utf8PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
};
let removal_result = remove_tenant_from_memory(
tenants,
tenant_shard_id,
tenant_dir_rename_operation(tenant_shard_id),
)
.await;
// Flush pending deletions, so that they have a good chance of passing validation
// before this tenant is potentially re-attached elsewhere.
deletion_queue_client.flush_advisory();
// Ignored tenants are not present in memory, so the removal-from-memory operation above will bail.
// Before returning the error, check for the ignored-tenant case: then we only need to clean up its local files.
if detach_ignored
&& matches!(
removal_result,
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
)
{
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
.await
.with_context(|| {
format!("Ignored tenant {tenant_shard_id} local directory rename")
})?;
return Ok(tmp_path);
}
}
removal_result
}
pub(crate) async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
@@ -2148,7 +2132,7 @@ pub(crate) async fn load_tenant(
let mut location_conf =
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
location_conf.attach_in_generation(AttachmentMode::Single, generation);
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
@@ -2212,6 +2196,27 @@ pub(crate) enum TenantMapListError {
Initializing,
}
///
/// Get list of tenants, for the mgmt API
///
pub(crate) async fn list_tenants(
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})
.collect())
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantMapInsertError {
#[error(transparent)]
@@ -2714,7 +2719,7 @@ use {
utils::http::error::ApiError,
};
pub(crate) fn immediate_gc(
pub(crate) async fn immediate_gc(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
@@ -2736,8 +2741,6 @@ pub(crate) fn immediate_gc(
// Run in task_mgr to avoid race with tenant_detach operation
let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
// TODO: spawning is redundant now, need to hold the gate
task_mgr::spawn(
&tokio::runtime::Handle::current(),
@@ -2752,15 +2755,16 @@ pub(crate) fn immediate_gc(
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
// Why not FuturesUnordered? It would need much the same task structure,
// but would run everything on a single task.
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
@@ -2776,7 +2780,7 @@ pub(crate) fn immediate_gc(
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care about the shutdown error, just exit fast
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
}
@@ -2787,7 +2791,6 @@ pub(crate) fn immediate_gc(
}
Ok(())
}
.instrument(span)
);
// hold the guard until after we've spawned the task so that timeline shutdown will wait for the task


@@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::download::download_retry;
use crate::tenant::storage_layer::AsLayerDesc;
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
use crate::tenant::upload_queue::Delete;
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{
config::PageServerConf,
@@ -266,6 +266,15 @@ pub enum MaybeDeletedIndexPart {
Deleted(IndexPart),
}
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
#[derive(Debug, thiserror::Error)]
pub enum StopError {
/// Returned if the upload queue was never initialized.
/// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
#[error("queue is not initialized")]
QueueUninitialized,
}
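For illustration, a hypothetical call site for the fallible `shutdown`/`stop` that this error type belongs to (the wrapper below is invented, not part of this diff):
// Sketch: during shutdown, an uninitialized upload queue just means nothing
// was ever scheduled for this timeline, so the caller may treat it as a no-op.
async fn shutdown_quietly(client: &std::sync::Arc<RemoteTimelineClient>) {
    match client.shutdown().await {
        Ok(()) => {}
        Err(StopError::QueueUninitialized) => {
            // nothing to flush; safe to ignore while shutting down
        }
    }
}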
#[derive(Debug, thiserror::Error)]
pub enum PersistIndexPartWithDeletedFlagError {
#[error("another task is already setting the deleted_flag, started at {0:?}")]
@@ -390,10 +399,15 @@ impl RemoteTimelineClient {
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
))?;
{
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
}
// this also locks the upload queue; without dropping the guard above, it would deadlock
self.stop().expect("initialized line above");
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
self.stop_impl(&mut upload_queue);
upload_queue
.stopped_mut()
@@ -407,8 +421,7 @@ impl RemoteTimelineClient {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q
UploadQueue::Stopped(q) => q
.upload_queue_for_deletion
.get_last_remote_consistent_lsn_projected(),
}
@@ -418,8 +431,7 @@ impl RemoteTimelineClient {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some(
UploadQueue::Stopped(q) => Some(
q.upload_queue_for_deletion
.get_last_remote_consistent_lsn_visible(),
),
@@ -886,7 +898,7 @@ impl RemoteTimelineClient {
/// Wait for all previously scheduled operations to complete, and then stop.
///
/// Not cancellation safe
pub(crate) async fn shutdown(self: &Arc<Self>) {
pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
// On cancellation the queue is left in an awkward state: it refuses new operations, but
// a proper stop has yet to be called. On cancellation the original task or some later
// task must call `stop` or `shutdown`.
@@ -897,12 +909,8 @@ impl RemoteTimelineClient {
let fut = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = match &mut *guard {
UploadQueue::Stopped(_) => return,
UploadQueue::Uninitialized => {
// transition into Stopped state
self.stop_impl(&mut guard);
return;
}
UploadQueue::Stopped(_) => return Ok(()),
UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
UploadQueue::Initialized(ref mut init) => init,
};
@@ -934,7 +942,7 @@ impl RemoteTimelineClient {
}
}
self.stop();
self.stop()
}
/// Set the deleted_at field in the remote index file.
@@ -1316,7 +1324,12 @@ impl RemoteTimelineClient {
// upload finishes or times out soon enough.
if cancel.is_cancelled() {
info!("upload task cancelled by shutdown request");
self.stop();
match self.stop() {
Ok(()) => {}
Err(StopError::QueueUninitialized) => {
unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
}
}
return;
}
@@ -1571,23 +1584,17 @@ impl RemoteTimelineClient {
/// In-progress operations will still be running after this function returns.
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
/// to wait for them to complete, after calling this function.
pub(crate) fn stop(&self) {
pub(crate) fn stop(&self) -> Result<(), StopError> {
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
// into stopped state, thereby dropping all of the queued *ops* which haven't become *tasks* yet.
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
let mut guard = self.upload_queue.lock().unwrap();
self.stop_impl(&mut guard);
}
fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
match &mut **guard {
UploadQueue::Uninitialized => {
info!("UploadQueue is in state Uninitialized, nothing to do");
**guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
}
match &mut *guard {
UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
UploadQueue::Stopped(_) => {
// nothing to do
info!("another concurrent task already shut down the queue");
Ok(())
}
UploadQueue::Initialized(initialized) => {
info!("shutting down upload queue");
@@ -1620,13 +1627,11 @@ impl RemoteTimelineClient {
};
let upload_queue = std::mem::replace(
&mut **guard,
UploadQueue::Stopped(UploadQueueStopped::Deletable(
UploadQueueStoppedDeletable {
upload_queue_for_deletion,
deleted_at: SetDeletedFlagProgress::NotRunning,
},
)),
&mut *guard,
UploadQueue::Stopped(UploadQueueStopped {
upload_queue_for_deletion,
deleted_at: SetDeletedFlagProgress::NotRunning,
}),
);
if let UploadQueue::Initialized(qi) = upload_queue {
qi
@@ -1655,6 +1660,10 @@ impl RemoteTimelineClient {
// which is exactly what we want to happen.
drop(op);
}
// We're done.
drop(guard);
Ok(())
}
}
}


@@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::TimelineId;
@@ -73,13 +73,55 @@ pub async fn download_layer_file<'a>(
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let bytes_amount = download_retry(
|| async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
let (mut destination_file, bytes_amount) = download_retry(
|| async {
let destination_file = tokio::fs::File::create(&temp_file_path)
.await
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
.map_err(DownloadError::Other)?;
let download = storage.download(&remote_path, cancel).await?;
let mut destination_file =
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
match bytes_amount {
Ok(bytes_amount) => {
let destination_file = destination_file.into_inner();
Ok((destination_file, bytes_amount))
}
Err(e) => {
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
}
Err(e.into())
}
}
},
&format!("download {remote_path:?}"),
cancel,
)
.await?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any,
// because we assume that the `destination_file` is fully written, i.e. there are no pending
// .write(...).await operations. But for additional safety, let's wait for any pending operations.
destination_file
.flush()
.await
.with_context(|| format!("flush source file at {temp_file_path}"))
.map_err(DownloadError::Other)?;
let expected = layer_metadata.file_size();
if expected != bytes_amount {
return Err(DownloadError::Other(anyhow!(
@@ -87,6 +129,14 @@ pub async fn download_layer_file<'a>(
)));
}
// not using sync_data because it can lose file size update
destination_file
.sync_all()
.await
.with_context(|| format!("failed to fsync source file at {temp_file_path}"))
.map_err(DownloadError::Other)?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
Err(DownloadError::Other(anyhow!(
"remote-storage-download-pre-rename failpoint triggered"
@@ -119,128 +169,6 @@ pub async fn download_layer_file<'a>(
Ok(bytes_amount)
}
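The crash-safety pattern used above (write to a temp path, fsync, then rename into place) as a standalone sketch; the helper name is illustrative, and the body buffers the whole payload in memory rather than streaming:
use std::path::Path;
// If the process crashes before the rename, only the temp file is lost; the
// destination path is either absent or a complete, durable download.
async fn atomic_download(dst: &Path, bytes: &[u8]) -> anyhow::Result<()> {
    let tmp = dst.with_extension("temp_download");
    tokio::fs::write(&tmp, bytes).await?;
    // sync_all rather than sync_data, mirroring the reasoning above about
    // not losing the file size update.
    tokio::fs::File::open(&tmp).await?.sync_all().await?;
    tokio::fs::rename(&tmp, dst).await?;
    Ok(())
}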
/// Download the object `src_path` in the remote `storage` to local path `dst_path`.
///
/// If Ok() is returned, the download succeeded and the inode & data have been made durable.
/// (Note that the directory entry for the inode is not made durable.)
/// The file size in bytes is returned.
///
/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
/// The unlinking has _not_ been made durable.
async fn download_object<'a>(
storage: &'a GenericRemoteStorage,
src_path: &RemotePath,
dst_path: &Utf8PathBuf,
cancel: &CancellationToken,
) -> Result<u64, DownloadError> {
let res = match crate::virtual_file::io_engine::get() {
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
crate::virtual_file::io_engine::IoEngine::StdFs => {
async {
let destination_file = tokio::fs::File::create(dst_path)
.await
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
.map_err(DownloadError::Other)?;
let download = storage.download(src_path, cancel).await?;
let mut buf_writer =
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
buf_writer.flush().await?;
let mut destination_file = buf_writer.into_inner();
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any,
// because we assume that the `destination_file` is fully written, i.e. there are no pending
// .write(...).await operations. But for additional safety, let's wait for any pending operations.
destination_file
.flush()
.await
.with_context(|| format!("flush source file at {dst_path}"))
.map_err(DownloadError::Other)?;
// not using sync_data because it can lose file size update
destination_file
.sync_all()
.await
.with_context(|| format!("failed to fsync source file at {dst_path}"))
.map_err(DownloadError::Other)?;
Ok(bytes_amount)
}
.await
}
#[cfg(target_os = "linux")]
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
async {
let destination_file = VirtualFile::create(dst_path)
.await
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
.map_err(DownloadError::Other)?;
let mut download = storage.download(src_path, cancel).await?;
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
// There's chunks_vectored() on the stream.
let (bytes_amount, destination_file) = async {
let size_tracking = size_tracking_writer::Writer::new(destination_file);
let mut buffered = owned_buffers_io::write::BufferedWriter::<
{ super::BUFFER_SIZE },
_,
>::new(size_tracking);
while let Some(res) =
futures::StreamExt::next(&mut download.download_stream).await
{
let chunk = match res {
Ok(chunk) => chunk,
Err(e) => return Err(e),
};
buffered
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
.await?;
}
let size_tracking = buffered.flush_and_into_inner().await?;
Ok(size_tracking.into_inner())
}
.await?;
// not using sync_data because it can lose file size update
destination_file
.sync_all()
.await
.with_context(|| format!("failed to fsync source file at {dst_path}"))
.map_err(DownloadError::Other)?;
Ok(bytes_amount)
}
.await
}
};
// in case the download failed, clean up
match res {
Ok(bytes_amount) => Ok(bytes_amount),
Err(e) => {
if let Err(e) = tokio::fs::remove_file(dst_path).await {
if e.kind() != std::io::ErrorKind::NotFound {
on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
}
}
Err(e)
}
}
}
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {


@@ -95,11 +95,7 @@ pub(crate) struct SecondaryTenant {
shard_identity: ShardIdentity,
tenant_conf: std::sync::Mutex<TenantConfOpt>,
// Internal state used by the Downloader.
detail: std::sync::Mutex<SecondaryDetail>,
// Public state indicating overall progress of downloads relative to the last heatmap seen
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
}
impl SecondaryTenant {
@@ -122,8 +118,6 @@ impl SecondaryTenant {
tenant_conf: std::sync::Mutex::new(tenant_conf),
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
progress: std::sync::Mutex::default(),
})
}
@@ -253,12 +247,9 @@ impl SecondaryTenant {
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
/// and heatmap uploads. This is not a hot data path: it's used for:
/// - Live migrations, where we want to ensure a migration destination has the freshest possible
/// content before trying to cut over.
/// - Tests, where we want to immediately upload/download for a particular tenant.
///
/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface.
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
/// where we want to immediately upload/download for a particular tenant. In normal operation
/// uploads & downloads are autonomous and not driven by this interface.
pub struct SecondaryController {
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,


@@ -15,7 +15,6 @@ use crate::{
tenant::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
ephemeral_file::is_ephemeral_file,
remote_timeline_client::{
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
@@ -42,16 +41,14 @@ use crate::tenant::{
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
use remote_storage::{DownloadError, GenericRemoteStorage};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
id::TimelineId,
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
};
use super::{
@@ -131,7 +128,6 @@ pub(super) struct SecondaryDetail {
pub(super) config: SecondaryLocationConfig,
last_download: Option<Instant>,
last_etag: Option<Etag>,
next_download: Option<Instant>,
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}
@@ -142,26 +138,11 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
datetime.format("%d/%m/%Y %T")
}
/// Information returned from the download function when it detects the heatmap has changed
struct HeatMapModified {
etag: Etag,
last_modified: SystemTime,
bytes: Vec<u8>,
}
enum HeatMapDownload {
// The heatmap's etag has changed: return the new etag, mtime and the body bytes
Modified(HeatMapModified),
// The heatmap's etag is unchanged
Unmodified,
}
impl SecondaryDetail {
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
Self {
config,
last_download: None,
last_etag: None,
next_download: None,
timelines: HashMap::new(),
}
@@ -496,31 +477,11 @@ impl<'a> TenantDownloader<'a> {
};
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// We will use the etag from the last successful download to make the download conditional on changes
let last_etag = self
.secondary_state
.detail
.lock()
.unwrap()
.last_etag
.clone();
// Download the tenant's heatmap
let HeatMapModified {
last_modified: heatmap_mtime,
etag: heatmap_etag,
bytes: heatmap_bytes,
} = match tokio::select!(
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
let heatmap_bytes = tokio::select!(
bytes = self.download_heatmap() => {bytes?},
_ = self.secondary_state.cancel.cancelled() => return Ok(())
) {
HeatMapDownload::Unmodified => {
tracing::info!("Heatmap unchanged since last successful download");
return Ok(());
}
HeatMapDownload::Modified(m) => m,
};
);
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
@@ -535,27 +496,11 @@ impl<'a> TenantDownloader<'a> {
.await
.maybe_fatal_err(&context_msg)?;
tracing::debug!(
"Wrote local heatmap to {}, with {} timelines",
heatmap_path,
heatmap.timelines.len()
);
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
// principle that deletions should be done before writes wherever possible, and so that we can use this
// phase to initialize our SecondaryProgress.
{
*self.secondary_state.progress.lock().unwrap() =
self.prepare_timelines(&heatmap, heatmap_mtime).await?;
}
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
// Download the layers in the heatmap
for timeline in heatmap.timelines {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!(
"Cancelled before downloading timeline {}",
timeline.timeline_id
);
return Ok(());
}
@@ -570,159 +515,30 @@ impl<'a> TenantDownloader<'a> {
.await?;
}
// Only update last_etag after a fully successful download: this way we will not skip
// the next download, even if the heatmap's actual etag is unchanged.
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
Ok(())
}
/// Do any fast local cleanup that comes before the much slower process of downloading
/// layers from remote storage. In the process, initialize the SecondaryProgress object
/// that will later be updated incrementally as we download layers.
async fn prepare_timelines(
&self,
heatmap: &HeatMapTenant,
heatmap_mtime: SystemTime,
) -> Result<SecondaryProgress, UpdateError> {
let heatmap_stats = heatmap.get_stats();
// We will construct a progress object, and then populate its initial "downloaded" numbers
// while iterating through local layer state in [`Self::prepare_timelines`]
let mut progress = SecondaryProgress {
layers_total: heatmap_stats.layers,
bytes_total: heatmap_stats.bytes,
heatmap_mtime: Some(heatmap_mtime),
layers_downloaded: 0,
bytes_downloaded: 0,
};
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
let mut delete_layers = Vec::new();
let mut delete_timelines = Vec::new();
{
let mut detail = self.secondary_state.detail.lock().unwrap();
for (timeline_id, timeline_state) in &mut detail.timelines {
let Some(heatmap_timeline_index) = heatmap
.timelines
.iter()
.position(|t| t.timeline_id == *timeline_id)
else {
// This timeline is no longer referenced in the heatmap: delete it locally
delete_timelines.push(*timeline_id);
continue;
};
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
let layers_in_heatmap = heatmap_timeline
.layers
.iter()
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| l.0)
.collect::<HashSet<_>>();
let mut layer_count = layers_on_disk.len();
let mut layer_byte_count: u64 = timeline_state
.on_disk_layers
.values()
.map(|l| l.metadata.file_size())
.sum();
// Remove on-disk layers that are no longer present in heatmap
for layer in layers_on_disk.difference(&layers_in_heatmap) {
layer_count -= 1;
layer_byte_count -= timeline_state
.on_disk_layers
.get(layer)
.unwrap()
.metadata
.file_size();
delete_layers.push((*timeline_id, (*layer).clone()));
}
progress.bytes_downloaded += layer_byte_count;
progress.layers_downloaded += layer_count;
}
}
// Execute accumulated deletions
for (timeline_id, layer_name) in delete_layers {
let timeline_path = self
.conf
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
let local_path = timeline_path.join(layer_name.to_string());
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)
.maybe_fatal_err("Removing secondary layer")?;
// Update in-memory housekeeping to reflect the absence of the deleted layer
let mut detail = self.secondary_state.detail.lock().unwrap();
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
continue;
};
timeline_state.on_disk_layers.remove(&layer_name);
}
for timeline_id in delete_timelines {
let timeline_path = self
.conf
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
tracing::info!(timeline_id=%timeline_id,
"Timeline no longer in heatmap, removing from secondary location"
);
tokio::fs::remove_dir_all(&timeline_path)
.await
.or_else(fs_ext::ignore_not_found)
.maybe_fatal_err("Removing secondary timeline")?;
}
Ok(progress)
}
/// Returns the downloaded bytes if the etag differs from `prev_etag`, or
/// [`HeatMapDownload::Unmodified`] if the object still matches `prev_etag`.
async fn download_heatmap(
&self,
prev_etag: Option<&Etag>,
) -> Result<HeatMapDownload, UpdateError> {
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
debug_assert_current_span_has_tenant_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// TODO: pull up etag check into the request, to do a conditional GET rather than
// issuing a GET and then maybe ignoring the response body
// TODO: make download conditional on ETag having changed since last download
// (https://github.com/neondatabase/neon/issues/6199)
tracing::debug!("Downloading heatmap for secondary tenant",);
let heatmap_path = remote_heatmap_path(tenant_shard_id);
let cancel = &self.secondary_state.cancel;
backoff::retry(
let heatmap_bytes = backoff::retry(
|| async {
let download = self
.remote_storage
.download(&heatmap_path, cancel)
.await
.map_err(UpdateError::from)?;
if Some(&download.etag) == prev_etag {
Ok(HeatMapDownload::Unmodified)
} else {
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
SECONDARY_MODE.download_heatmap.inc();
Ok(HeatMapDownload::Modified(HeatMapModified {
etag: download.etag,
last_modified: download.last_modified,
bytes: heatmap_bytes,
}))
}
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
Ok(heatmap_bytes)
},
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
FAILED_DOWNLOAD_WARN_THRESHOLD,
@@ -732,7 +548,11 @@ impl<'a> TenantDownloader<'a> {
)
.await
.ok_or_else(|| UpdateError::Cancelled)
.and_then(|x| x)
.and_then(|x| x)?;
SECONDARY_MODE.download_heatmap.inc();
Ok(heatmap_bytes)
}
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
@@ -773,13 +593,31 @@ impl<'a> TenantDownloader<'a> {
}
};
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
let layers_in_heatmap = timeline
.layers
.iter()
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| l.0)
.collect::<HashSet<_>>();
// Remove on-disk layers that are no longer present in heatmap
for layer in layers_on_disk.difference(&layers_in_heatmap) {
let local_path = timeline_path.join(layer.to_string());
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)
.maybe_fatal_err("Removing secondary layer")?;
}
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!("Cancelled -- dropping out of layer loop");
return Ok(());
}
@@ -824,12 +662,6 @@ impl<'a> TenantDownloader<'a> {
}
}
// Failpoint for simulating slow remote storage
failpoint_support::sleep_millis_async!(
"secondary-layer-download-sleep",
&self.secondary_state.cancel
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
@@ -869,11 +701,6 @@ impl<'a> TenantDownloader<'a> {
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
} else {
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.bytes_downloaded += downloaded_bytes;
progress.layers_downloaded += 1;
}
SECONDARY_MODE.download_layer.inc();
@@ -962,10 +789,7 @@ async fn init_timeline_state(
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path)
|| is_temp_download_file(&file_path)
|| is_ephemeral_file(file_name)
{
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");
if let Err(e) = tokio::fs::remove_file(&file_path)


@@ -62,25 +62,3 @@ impl HeatMapTimeline {
}
}
}
pub(crate) struct HeatMapStats {
pub(crate) bytes: u64,
pub(crate) layers: usize,
}
impl HeatMapTenant {
pub(crate) fn get_stats(&self) -> HeatMapStats {
let mut stats = HeatMapStats {
bytes: 0,
layers: 0,
};
for timeline in &self.timelines {
for layer in &timeline.layers {
stats.layers += 1;
stats.bytes += layer.metadata.file_size;
}
}
stats
}
}
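A short sketch of how `get_stats` is consumed, mirroring the progress seeding in `prepare_timelines` earlier in this diff (`heatmap` and `heatmap_mtime` are assumed to be in scope):
// Seed the progress totals from the heatmap before any layer downloads start.
let heatmap_stats = heatmap.get_stats();
let progress = SecondaryProgress {
    layers_total: heatmap_stats.layers,
    bytes_total: heatmap_stats.bytes,
    heatmap_mtime: Some(heatmap_mtime),
    layers_downloaded: 0,
    bytes_downloaded: 0,
};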


@@ -9,7 +9,6 @@ use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode,
mgr::GetTenantError,
mgr::TenantManager,
remote_timeline_client::remote_heatmap_path,
span::debug_assert_current_span_has_tenant_id,
@@ -293,11 +292,8 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
"Starting heatmap write on command");
let tenant = self
.tenant_manager
.get_attached_tenant_shard(*tenant_shard_id)
.get_attached_tenant_shard(*tenant_shard_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
if !tenant.is_active() {
return Err(GetTenantError::NotActive(*tenant_shard_id).into());
}
Ok(UploadPending {
// Ignore our state for last digest: this forces an upload even if nothing has changed


@@ -300,7 +300,6 @@ where
let tenant_shard_id = job.get_tenant_shard_id();
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
tracing::info!("Command already running, waiting for it");
barrier
} else {
let running = self.spawn_now(job);


@@ -183,13 +183,7 @@ pub(super) async fn gather_inputs(
// new gc run, which we have no control over. however differently from `Timeline::gc`
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
//
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// horizon_cutoff.
let mut next_gc_cutoff = gc_info.pitr_cutoff;
let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {


@@ -3,7 +3,7 @@
pub mod delta_layer;
mod filename;
pub mod image_layer;
pub(crate) mod inmemory_layer;
mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
@@ -20,7 +20,6 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::models::{
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::borrow::Cow;
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
@@ -428,7 +427,7 @@ impl LayerAccessStatFullDetails {
} = self;
pageserver_api::models::LayerAccessStatFullDetails {
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
task_kind: task_kind.into(), // into static str, powered by strum_macros
access_kind: *access_kind,
}
}
@@ -526,7 +525,7 @@ impl LayerAccessStats {
.collect(),
task_kind_access_flag: task_kind_flag
.iter()
.map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
.collect(),
first: first_access.as_ref().map(|a| a.as_api_model()),
accesses_history: last_accesses.map(|m| m.as_api_model()),


@@ -23,12 +23,8 @@ use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
use std::cmp::Ordering;
use std::fmt::Write as _;
use std::ops::Range;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{
@@ -74,8 +70,6 @@ pub struct InMemoryLayerInner {
/// Each serialized Value is preceded by a 'u32' length field.
/// PerSeg::page_versions map stores offsets into this file.
file: EphemeralFile,
resource_units: GlobalResourceUnits,
}
impl std::fmt::Debug for InMemoryLayerInner {
@@ -84,121 +78,6 @@ impl std::fmt::Debug for InMemoryLayerInner {
}
}
/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
/// to minimize contention.
///
/// This global state is used to implement behaviors that require a global view of the system, e.g.
/// rolling layers proactively to limit the total amount of dirty data.
pub(crate) struct GlobalResources {
// Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
// Zero means unlimited.
pub(crate) max_dirty_bytes: AtomicU64,
// How many bytes are in all EphemeralFile objects
dirty_bytes: AtomicU64,
// How many layers are contributing to dirty_bytes
dirty_layers: AtomicUsize,
}
// Per-timeline RAII struct for its contribution to [`GlobalResources`]
struct GlobalResourceUnits {
// How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
// for decrementing the global counter by this many bytes when dropped.
dirty_bytes: u64,
}
impl GlobalResourceUnits {
// Hint for the layer append path to update us when the layer size differs from the last
// published size by this much. If we don't reach this threshold, we'll still get
// updated when the Timeline "ticks" in the background.
const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
fn new() -> Self {
GLOBAL_RESOURCES
.dirty_layers
.fetch_add(1, AtomicOrdering::Relaxed);
Self { dirty_bytes: 0 }
}
/// Do not call this frequently: all timelines will write to these same global atomics,
/// so this is a relatively expensive operation. Wait at least a few seconds between calls.
///
/// Returns the effective layer size limit that should be applied, if any, to keep
/// the total number of dirty bytes below the configured maximum.
fn publish_size(&mut self, size: u64) -> Option<u64> {
let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
Ordering::Greater => {
let delta = size - self.dirty_bytes;
let old = GLOBAL_RESOURCES
.dirty_bytes
.fetch_add(delta, AtomicOrdering::Relaxed);
old + delta
}
Ordering::Less => {
let delta = self.dirty_bytes - size;
let old = GLOBAL_RESOURCES
.dirty_bytes
.fetch_sub(delta, AtomicOrdering::Relaxed);
old - delta
}
};
// This is a sloppy update: concurrent updates to the counter will race, and the exact
// value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
// That's okay: as long as the metric contains some recent value, it doesn't have to always
// be literally the last update.
TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
self.dirty_bytes = size;
let max_dirty_bytes = GLOBAL_RESOURCES
.max_dirty_bytes
.load(AtomicOrdering::Relaxed);
if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
// Set the layer file limit to the average layer size: this implies that all above-average
// sized layers will be eligible for freezing. They will be frozen in the order they
// next enter publish_size.
Some(
new_global_dirty_bytes
/ GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
)
} else {
None
}
}
// Call publish_size if the input size differs from last published size by more than
// the drift limit
fn maybe_publish_size(&mut self, size: u64) {
let publish = match size.cmp(&self.dirty_bytes) {
Ordering::Equal => false,
Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
};
if publish {
self.publish_size(size);
}
}
}
impl Drop for GlobalResourceUnits {
fn drop(&mut self) {
GLOBAL_RESOURCES
.dirty_layers
.fetch_sub(1, AtomicOrdering::Relaxed);
// Subtract our contribution to the global total dirty bytes
self.publish_size(0);
}
}
pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
max_dirty_bytes: AtomicU64::new(0),
dirty_bytes: AtomicU64::new(0),
dirty_layers: AtomicUsize::new(0),
};
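A minimal usage sketch of this accounting; the caller below is invented, while `tick` is the `InMemoryLayer` method shown just below in this diff:
// Illustrative background tick: publish the layer's current size and, if the
// global dirty-bytes limit is exceeded, receive an average-layer-size
// threshold above which layers should be frozen.
async fn tick_example(layer: &InMemoryLayer) {
    if let Some(roll_at) = layer.tick().await {
        // layers larger than `roll_at` are candidates for freezing
        let _ = roll_at;
    }
}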
impl InMemoryLayer {
pub(crate) fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
@@ -214,10 +93,6 @@ impl InMemoryLayer {
}
}
pub(crate) fn try_len(&self) -> Option<u64> {
self.inner.try_read().map(|i| i.file.len()).ok()
}
pub(crate) fn assert_writable(&self) {
assert!(self.end_lsn.get().is_none());
}
@@ -453,7 +328,6 @@ impl InMemoryLayer {
inner: RwLock::new(InMemoryLayerInner {
index: HashMap::new(),
file,
resource_units: GlobalResourceUnits::new(),
}),
})
}
@@ -504,18 +378,9 @@ impl InMemoryLayer {
warn!("Key {} at {} already exists", key, lsn);
}
let size = locked_inner.file.len();
locked_inner.resource_units.maybe_publish_size(size);
Ok(())
}
pub(crate) async fn tick(&self) -> Option<u64> {
let mut inner = self.inner.write().await;
let size = inner.file.len();
inner.resource_units.publish_size(size)
}
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())

File diff suppressed because it is too large.


@@ -1,119 +0,0 @@
//! failpoints for unit tests, implying `#[cfg(test)]`.
//!
//! These are not accessible over http.
use super::*;
impl Layer {
/// Enable a failpoint from a unit test.
pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
self.0.failpoints.lock().unwrap().push(failpoint);
}
}
impl LayerInner {
/// Query if this failpoint is enabled, as in, arrive at a failpoint.
///
/// Calls to this method need to be `#[cfg(test)]` guarded.
pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
let fut = {
let mut fps = self.failpoints.lock().unwrap();
// find the *last* failpoint for cases in which we need to use multiple for the same
// thing (two blocked evictions)
let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
let Some(fp) = fp else {
return Ok(());
};
fp.hit()
};
fut.await
}
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum FailpointKind {
/// Failpoint acts as an accurate cancelled-by-drop here; see the only site of use.
AfterDeterminingLayerNeedsNoDownload,
/// Failpoint for stalling eviction starting
WaitBeforeStartingEvicting,
/// Failpoint hit in the spawned task
WaitBeforeDownloading,
}
pub(crate) enum Failpoint {
AfterDeterminingLayerNeedsNoDownload,
WaitBeforeStartingEvicting(
Option<utils::completion::Completion>,
utils::completion::Barrier,
),
WaitBeforeDownloading(
Option<utils::completion::Completion>,
utils::completion::Barrier,
),
}
impl Failpoint {
fn kind(&self) -> FailpointKind {
match self {
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
FailpointKind::AfterDeterminingLayerNeedsNoDownload
}
Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
}
}
fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
use futures::future::FutureExt;
// use boxed futures to avoid Either hurdles
match self {
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
let kind = self.kind();
async move { Err(FailpointHit(kind)) }.boxed()
}
Failpoint::WaitBeforeStartingEvicting(arrival, b)
| Failpoint::WaitBeforeDownloading(arrival, b) => {
// first one signals arrival
drop(arrival.take());
let b = b.clone();
async move {
tracing::trace!("waiting on a failpoint barrier");
b.wait().await;
tracing::trace!("done waiting on a failpoint barrier");
Ok(())
}
.boxed()
}
}
}
}
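The `.boxed()` calls in `hit()` exist because the two match arms produce futures of different concrete types; boxing unifies them into one `BoxFuture` return type instead of nesting `Either`. A sketch of the same trick:

use futures::future::{BoxFuture, FutureExt};

// Branches of different concrete future types unify behind BoxFuture,
// trading one allocation for a single return type.
fn immediate_or_wait(fail_now: bool) -> BoxFuture<'static, Result<(), &'static str>> {
    if fail_now {
        async { Err("failpoint hit") }.boxed()
    } else {
        async {
            tokio::task::yield_now().await;
            Ok(())
        }
        .boxed()
    }
}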
impl std::fmt::Display for FailpointKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(self, f)
}
}
#[derive(Debug)]
pub(crate) struct FailpointHit(FailpointKind);
impl std::fmt::Display for FailpointHit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(self, f)
}
}
impl std::error::Error for FailpointHit {}
impl From<FailpointHit> for DownloadError {
fn from(value: FailpointHit) -> Self {
DownloadError::Failpoint(value.0)
}
}

View File

@@ -1,13 +1,14 @@
use futures::StreamExt;
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use tracing::Instrument;
use utils::{
completion::{self, Completion},
id::TimelineId,
};
use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::context::DownloadBehavior;
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
/// Used in tests to advance a future to the wanted await point, and no further.
@@ -20,7 +21,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
#[tokio::test]
async fn smoke_test() {
let handle = tokio::runtime::Handle::current();
let handle = BACKGROUND_RUNTIME.handle();
let h = TenantHarness::create("smoke_test").unwrap();
let span = h.span();
@@ -37,7 +38,7 @@ async fn smoke_test() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.resident_layers().collect::<Vec<_>>().await
};
assert_eq!(layers.len(), 1);
@@ -87,7 +88,7 @@ async fn smoke_test() {
//
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
// artificially slow it down.
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
match layer
.evict_and_wait(std::time::Duration::ZERO)
@@ -98,7 +99,7 @@ async fn smoke_test() {
// expected, but note that the eviction is "still ongoing"
helper.release().await;
// exhaust spawn_blocking pool to ensure it is now complete
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
.await;
}
other => unreachable!("{other:?}"),
@@ -107,7 +108,7 @@ async fn smoke_test() {
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
// downloaded locally.
let none = layer.keep_resident().await;
let none = layer.keep_resident().await.unwrap();
assert!(
none.is_none(),
"Expected none, because eviction removed the local file, found: {none:?}"
@@ -166,7 +167,6 @@ async fn smoke_test() {
rtc.wait_completion().await.unwrap();
assert_eq!(rtc.get_remote_physical_size(), 0);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
/// This test demonstrates a previous hang when an eviction and deletion were requested at the same
@@ -174,7 +174,7 @@ async fn smoke_test() {
#[tokio::test(start_paused = true)]
async fn evict_and_wait_on_wanted_deleted() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let handle = BACKGROUND_RUNTIME.handle();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
@@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.resident_layers().collect::<Vec<_>>().await
};
assert_eq!(layers.len(), 1);
@@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() {
drop(resident);
// make sure the eviction task gets to run
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
let resident = layer.keep_resident().await;
assert!(
resident.is_none(),
matches!(resident, Ok(None)),
"keep_resident should not have re-initialized: {resident:?}"
);
@@ -235,332 +235,24 @@ async fn evict_and_wait_on_wanted_deleted() {
layers.finish_gc_timeline(&[layer]);
}
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
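These tests rely on tokio's paused clock: with `start_paused = true`, time advances only while the runtime is otherwise idle, so `tokio::time::timeout(ADVANCE, fut)` deterministically drives `fut` up to its next await point and no further. A self-contained sketch of the idiom (requires tokio's `test-util` feature):

#[tokio::test(start_paused = true)]
async fn advance_to_await_point() {
    let advance = std::time::Duration::from_secs(1);
    let mut fut = std::pin::pin!(async {
        // stands in for evict_and_wait: parks on a long await
        tokio::time::sleep(std::time::Duration::from_secs(3600)).await;
    });
    // the paused clock jumps to the earliest deadline -- the timeout --
    // so "still pending" is observed without any real waiting
    tokio::time::timeout(advance, &mut fut)
        .await
        .expect_err("future should still be parked at its await point");
}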
/// This test ensures we are able to read the layer while the layer eviction has been
/// started but not completed.
#[test]
fn read_wins_pending_eviction() {
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(1)
.enable_all()
.start_paused(true)
.build()
.unwrap();
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// setup done
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
let (completion, barrier) = utils::completion::channel();
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
// now the eviction cannot proceed because the threads are consumed while completion exists
drop(resident);
arrived_at_barrier.wait().await;
assert!(!layer.is_likely_resident());
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.0
.get_or_maybe_download(false, None)
.instrument(download_span)
.await
.expect("should had reinitialized without downloading");
assert!(layer.is_likely_resident());
// reinitialization notifies of new resident status, which should error out all evict_and_wait
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect("no timeout, because get_or_maybe_download re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because of a failpoint
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
drop(completion);
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
.await;
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// now we finally can observe the original eviction failing
// it would have been possible to observe it earlier, but here it is guaranteed to have
// happened.
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
});
}
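The hand-built runtime above is what makes the scenario deterministic: with exactly one blocking thread, a single parked task clogs the entire `spawn_blocking` pool. Extracted as a helper (the `start_paused` knob also needs tokio's `test-util` feature):

fn single_blocking_thread_runtime() -> tokio::runtime::Runtime {
    tokio::runtime::Builder::new_current_thread()
        .max_blocking_threads(1)
        .enable_all()
        .start_paused(true)
        .build()
        .expect("runtime builds")
}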
/// Use a failpoint to delay an eviction's start to get a VersionCheckFailed.
#[test]
fn multiple_pending_evictions_in_order() {
let name = "multiple_pending_evictions_in_order";
let in_order = true;
multiple_pending_evictions_scenario(name, in_order);
}
/// Use a failpoint to reorder a later eviction before the first to get an UnexpectedEvictedState.
#[test]
fn multiple_pending_evictions_out_of_order() {
let name = "multiple_pending_evictions_out_of_order";
let in_order = false;
multiple_pending_evictions_scenario(name, in_order);
}
fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(1)
.enable_all()
.start_paused(true)
.build()
.unwrap();
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create(name).unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// setup done
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
let (completion1, barrier) = utils::completion::channel();
let mut completion1 = Some(completion1);
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
// now the eviction cannot proceed because we are simulating arbitrary long delay for the
// eviction task start.
drop(resident);
assert!(!layer.is_likely_resident());
arrived_at_barrier.wait().await;
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.0
.get_or_maybe_download(false, None)
.instrument(download_span)
.await
.expect("should had reinitialized without downloading");
assert!(layer.is_likely_resident());
// reinitialization notifies of new resident status, which should error out all evict_and_wait
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect("no timeout, because get_or_maybe_download re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because of a failpoint
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// configure another failpoint for the second eviction -- evictions are per initialization,
// so now that we've reinitialized the inner, we get to run two of them at the same time.
let (completion2, barrier) = utils::completion::channel();
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
// advance to the wait on the queue
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect_err("timeout because failpoint is blocking");
arrived_at_barrier.wait().await;
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
let mut release_earlier_eviction = |expected_reason| {
assert_eq!(
0,
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
);
drop(completion1.take().unwrap());
let handle = &handle;
async move {
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(
handle, 1,
)
.await;
assert_eq!(
1,
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
);
}
};
if in_order {
release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await;
}
// release the later eviction which is for the current version
drop(completion2);
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
.await;
if !in_order {
release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await;
}
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
// ensure the cancelled are unchanged
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
});
}
/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently
/// a `Layer::keep_resident` call.
/// This test ensures we are able to read the layer while the layer eviction has been
/// started but not completed due to spawn_blocking pool being blocked.
///
/// This matters because cancelling the eviction would leave us in a state where the file is on
/// disk but the layer internal state says it has not been initialized. Furthermore, it allows us to
/// have non-repairing `Layer::is_likely_resident`.
/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
#[tokio::test(start_paused = true)]
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let handle = tokio::runtime::Handle::current();
let h =
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = BACKGROUND_RUNTIME.handle();
let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
.unwrap();
let (tenant, ctx) = h.load().await;
let timeline = tenant
@@ -571,7 +263,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
layers.resident_layers().collect::<Vec<_>>().await
};
assert_eq!(layers.len(), 1);
@@ -579,154 +271,90 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
layers.swap_remove(0)
};
// this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
// Err) at the right time as in "during" the `LayerInner::needs_download`.
layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);
// setup done
let (completion, barrier) = utils::completion::channel();
let (arrival, arrived_at_barrier) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
.await
.expect_err("should had advanced to waiting on channel");
arrived_at_barrier.wait().await;
// simulate a cancelled read which is cancelled before it gets to re-initialize
let e = layer
.0
.get_or_maybe_download(false, None)
.await
.unwrap_err();
assert!(
matches!(
e,
DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
),
"{e:?}"
);
assert!(
layer.0.needs_download().await.unwrap().is_none(),
"file is still on disk"
);
// release the eviction task
drop(completion);
tokio::time::sleep(ADVANCE).await;
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
// failpoint is still enabled, but it is not hit
let e = layer
.0
.get_or_maybe_download(false, None)
.await
.unwrap_err();
assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
// failpoint is not counted as cancellation either
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
}
#[tokio::test(start_paused = true)]
async fn evict_and_wait_does_not_wait_for_download() {
// let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.likely_resident_layers().collect::<Vec<_>>()
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// kind of forced setup: start an eviction but do not allow it to progress until we are
// downloading
let (eviction_can_continue, barrier) = utils::completion::channel();
let (arrival, eviction_arrived) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
Some(arrival),
barrier,
));
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// use this once-awaited other_evict to synchronize with the eviction
let other_evict = layer.evict_and_wait(FOREVER);
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had advanced");
eviction_arrived.wait().await;
drop(eviction_can_continue);
other_evict.await.unwrap();
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
// now the layer is evicted, and the "evict_and_wait" is waiting on the receiver
assert!(!layer.is_likely_resident());
// clog up BACKGROUND_RUNTIME spawn_blocking
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
// following new evict_and_wait will fail until we've completed the download
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
// now the eviction cannot proceed because the threads are consumed while completion exists
drop(resident);
let (download_can_continue, barrier) = utils::completion::channel();
let (arrival, _download_arrived) = utils::completion::channel();
layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier));
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.keep_resident()
.await
.expect("keep_resident should had reinitialized without downloading")
.expect("ResidentLayer");
let mut download = std::pin::pin!(layer
.0
.get_or_maybe_download(true, None)
.instrument(download_span));
// because the keep_resident check alters the wanted-evicted state without sending a message,
// we will never see the eviction complete
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect("no timeout, because keep_resident re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
assert!(
!layer.is_likely_resident(),
"during download layer is evicted"
// works as intended: evictions lose to "downloads"
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
// this is not wrong: the eviction is technically still "on the way" as it's still queued
// because spawn_blocking is clogged up
assert_eq!(
0,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
tokio::time::timeout(ADVANCE, &mut download)
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
// advance to the wait on the queue
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect_err("should had timed out because of failpoint");
.expect_err("timeout because spawn_blocking is clogged");
// now we finally get to continue, and because the latest state is downloading, we deduce that
// the original eviction succeeded
evict_and_wait.await.unwrap();
// in this case we don't leak started evictions, but I think there is still a chance of that
// happening, because we could have upgrades race multiple evictions while only one of them
// happens?
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
// however a new evict_and_wait will fail
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
helper.release().await;
assert!(!layer.is_likely_resident());
// the second_eviction gets to run here
//
// synchronize to be *strictly* after the second_eviction spawn_blocking run
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
drop(download_can_continue);
download.await.expect("download should had succeeded");
assert!(layer.is_likely_resident());
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
// only now can we evict
layer.evict_and_wait(FOREVER).await.unwrap();
}
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
#[test]
fn layer_size() {
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
// it also has the utf8 path
// now we finally can observe the original spawn_blocking failing
// it would have been possible to observe it earlier, but here it is guaranteed to have
// happened.
assert_eq!(
1,
LAYER_IMPL_METRICS
.cancelled_evictions
.values()
.map(|ctr| ctr.get())
.sum::<u64>()
);
}
struct SpawnBlockingPoolHelper {
@@ -743,41 +371,31 @@ impl SpawnBlockingPoolHelper {
///
/// This should be no issue nowadays, because nextest runs each test in its own process.
async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
let default_max_blocking_threads = 512;
Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await
}
async fn consume_all_spawn_blocking_threads0(
handle: &tokio::runtime::Handle,
threads: usize,
) -> Self {
assert_ne!(threads, 0);
let (completion, barrier) = completion::channel();
let (started, starts_completed) = completion::channel();
let (tx, mut rx) = tokio::sync::mpsc::channel(8);
let assumed_max_blocking_threads = 512;
let mut blocking_tasks = JoinSet::new();
for _ in 0..threads {
for _ in 0..assumed_max_blocking_threads {
let barrier = barrier.clone();
let started = started.clone();
let tx = tx.clone();
blocking_tasks.spawn_blocking_on(
move || {
drop(started);
tx.blocking_send(()).unwrap();
drop(tx);
tokio::runtime::Handle::current().block_on(barrier.wait());
},
handle,
);
}
drop(started);
starts_completed.wait().await;
drop(barrier);
tracing::trace!("consumed all threads");
for _ in 0..assumed_max_blocking_threads {
rx.recv().await.unwrap();
}
SpawnBlockingPoolHelper {
awaited_by_spawn_blocking_tasks: completion,
@@ -797,22 +415,13 @@ impl SpawnBlockingPoolHelper {
while let Some(res) = blocking_tasks.join_next().await {
res.expect("none of the tasks should had panicked");
}
tracing::trace!("released all threads");
}
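The mpsc handshake above is the load-bearing detail: each blocking task reports in over the channel before parking on the barrier, so once the spawner has received one message per task, every pool thread is known to be busy. A standalone sketch of that handshake (the watch-based release is an illustrative stand-in for the completion barrier):

use tokio::sync::{mpsc, watch};

async fn occupy_blocking_pool(n: usize) -> watch::Sender<bool> {
    let (release_tx, release_rx) = watch::channel(false);
    let (started_tx, mut started_rx) = mpsc::channel::<()>(n);
    for _ in 0..n {
        let started = started_tx.clone();
        let mut release = release_rx.clone();
        tokio::task::spawn_blocking(move || {
            // report in, then park until released
            started.blocking_send(()).expect("receiver alive");
            tokio::runtime::Handle::current().block_on(async {
                let _ = release.wait_for(|released| *released).await;
            });
        });
    }
    drop(started_tx);
    for _ in 0..n {
        // one message per task: all n pool threads are now occupied
        started_rx.recv().await.expect("each task reports once");
    }
    release_tx
}

// release later with: release_tx.send(true).unwrap();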
/// In the tests it is used as an easy way of making sure something scheduled on the target
/// runtime's `spawn_blocking` has completed, because it must've been scheduled and completed
/// before our tasks have a chance to schedule and complete.
async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
}
async fn consume_and_release_all_of_spawn_blocking_threads0(
handle: &tokio::runtime::Handle,
threads: usize,
) {
Self::consume_all_spawn_blocking_threads0(handle, threads)
Self::consume_all_spawn_blocking_threads(handle)
.await
.release()
.await
@@ -826,7 +435,7 @@ fn spawn_blocking_pool_helper_actually_works() {
// because the amount is not configurable for our helper, expect the same amount as
// BACKGROUND_RUNTIME using the tokio defaults would have.
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(1)
.max_blocking_threads(512)
.enable_all()
.build()
.unwrap();
@@ -836,8 +445,7 @@ fn spawn_blocking_pool_helper_actually_works() {
rt.block_on(async move {
// this will not return until all threads are spun up and actually executing the code
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
let consumed =
SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
println!("consumed");

View File

@@ -13,13 +13,14 @@ use bytes::Bytes;
use camino::Utf8Path;
use enumset::EnumSet;
use fail::fail_point;
use futures::stream::StreamExt;
use once_cell::sync::Lazy;
use pageserver_api::{
key::AUX_FILES_KEY,
keyspace::KeySpaceAccum,
models::{
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
EvictionPolicy, LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, TenantShardId},
@@ -36,7 +37,6 @@ use tracing::*;
use utils::{
bin_ser::BeSer,
sync::gate::{Gate, GateGuard},
vec_map::VecMap,
};
use std::ops::{Deref, Range};
@@ -54,7 +54,6 @@ use std::{
ops::ControlFlow,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -65,6 +64,7 @@ use crate::{
disk_usage_eviction_task::DiskUsageEvictionInfo,
pgdatadir_mapping::CollectKeySpaceError,
};
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
@@ -1142,79 +1142,6 @@ impl Timeline {
self.flush_frozen_layers_and_wait().await
}
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
///
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
/// even if there are no ongoing writes to drive that.
async fn maybe_freeze_ephemeral_layer(&self) {
let Ok(_write_guard) = self.write_lock.try_lock() else {
// If the write lock is held, there is an active wal receiver: rolling open layers
// is their responsibility while they hold this lock.
return;
};
let Ok(layers_guard) = self.layers.try_read() else {
// Don't block if the layer lock is busy
return;
};
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
// No open layer, no work to do.
return;
};
let Some(current_size) = open_layer.try_len() else {
// Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
// taking the read lock to get its size should always succeed.
tracing::warn!("Lock conflict while reading size of open layer");
return;
};
let current_lsn = self.get_last_record_lsn();
let checkpoint_distance_override = open_layer.tick().await;
if let Some(size_override) = checkpoint_distance_override {
if current_size > size_override {
// This is not harmful, but it only happens in relatively rare cases where
// time-based checkpoints are not happening fast enough to keep the amount of
// ephemeral data within configured limits. It's a sign of stress on the system.
tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
}
}
let checkpoint_distance =
checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
if self.should_roll(
current_size,
current_size,
checkpoint_distance,
self.get_last_record_lsn(),
self.last_freeze_at.load(),
*self.last_freeze_ts.read().unwrap(),
) {
match open_layer.info() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
// We may reach this point if the layer was already frozen but not yet flushed: flushing
// happens asynchronously in the background.
tracing::debug!(
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
);
}
InMemoryLayerInfo::Open { .. } => {
// Upgrade to a write lock and freeze the layer
drop(layers_guard);
let mut layers_guard = self.layers.write().await;
layers_guard
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
.await;
}
}
self.flush_frozen_layers();
}
}
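The let-else chain above makes the housekeeping pass strictly non-blocking: if any lock is contended, the background task simply returns and retries on its next period instead of stalling the write path. The same shape in miniature, with placeholder state:

use tokio::sync::{Mutex, RwLock};

async fn housekeeping_tick(write_lock: &Mutex<()>, layers: &RwLock<Vec<u64>>) {
    let Ok(_write_guard) = write_lock.try_lock() else {
        // an active writer is responsible for rolling layers itself
        return;
    };
    let Ok(layers_guard) = layers.try_read() else {
        // don't stall housekeeping behind a busy layer map
        return;
    };
    // ... inspect `layers_guard` and decide whether to freeze ...
    let _ = layers_guard.len();
}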
/// Outermost timeline compaction operation; downloads needed layers.
pub(crate) async fn compact(
self: &Arc<Self>,
@@ -1237,11 +1164,6 @@ impl Timeline {
(guard, permit)
};
// Prior to compaction, check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle.
self.maybe_freeze_ephemeral_layer().await;
// this wait probably never needs any "long time spent" logging, because we already nag if
// the compaction task goes over its period (20s), which is quite often in production.
let (_guard, _permit) = tokio::select! {
@@ -1274,7 +1196,6 @@ impl Timeline {
pub(crate) fn activate(
self: &Arc<Self>,
parent: Arc<crate::tenant::Tenant>,
broker_client: BrokerClientChannel,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
@@ -1285,7 +1206,7 @@ impl Timeline {
}
self.launch_wal_receiver(ctx, broker_client);
self.set_state(TimelineState::Active);
self.launch_eviction_task(parent, background_jobs_can_start);
self.launch_eviction_task(background_jobs_can_start);
}
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
@@ -1320,7 +1241,11 @@ impl Timeline {
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
if let Err(e) = client.shutdown().await {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to flush to remote storage: {e:#}");
}
}
}
Err(e) => {
@@ -1357,7 +1282,12 @@ impl Timeline {
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
// case our caller wants to use that for a deletion
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.stop();
match remote_client.stop() {
Ok(()) => {}
Err(StopError::QueueUninitialized) => {
// Shutting down during initialization is legal
}
}
}
tracing::debug!("Waiting for tasks...");
@@ -1513,53 +1443,6 @@ impl Timeline {
Err(EvictionError::Timeout) => Ok(Some(false)),
}
}
fn should_roll(
&self,
layer_size: u64,
projected_layer_size: u64,
checkpoint_distance: u64,
projected_lsn: Lsn,
last_freeze_at: Lsn,
last_freeze_ts: Instant,
) -> bool {
let distance = projected_lsn.widening_sub(last_freeze_at);
// Rolling the open layer can be triggered by:
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
// account for how writes are distributed across shards: we expect each node to consume
// 1/count of the LSN on average.
// 2. The size of the currently open layer.
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
// up and suspend activity.
if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 {
info!(
"Will roll layer at {} with layer size {} due to LSN distance ({})",
projected_lsn, layer_size, distance
);
true
} else if projected_layer_size >= checkpoint_distance {
info!(
"Will roll layer at {} with layer size {} due to layer size ({})",
projected_lsn, layer_size, projected_layer_size
);
true
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
info!(
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
projected_lsn,
layer_size,
last_freeze_ts.elapsed()
);
true
} else {
false
}
}
}
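Condition 1 above scales with shard count. With illustrative numbers (not the production defaults), a 4-shard tenant rolls on LSN distance only after four checkpoint distances of WAL have passed, because each shard ingests roughly 1/count of the stream:

#[test]
fn lsn_distance_trigger_scales_with_shard_count() {
    let checkpoint_distance: u64 = 256 * 1024 * 1024; // 256 MiB, illustrative
    let shard_count: u64 = 4;
    // scaling keeps the total WAL the safekeepers must retain bounded
    assert_eq!(checkpoint_distance * shard_count, 1024 * 1024 * 1024); // 1 GiB
}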
/// Number of times we will compute partition within a checkpoint distance.
@@ -2559,7 +2442,7 @@ impl Timeline {
let guard = self.layers.read().await;
let resident = guard.likely_resident_layers().map(|layer| {
let resident = guard.resident_layers().map(|layer| {
let last_activity_ts = layer.access_stats().latest_activity_or_now();
HeatMapLayer::new(
@@ -2569,7 +2452,7 @@ impl Timeline {
)
});
let layers = resident.collect();
let layers = resident.collect().await;
Some(HeatMapTimeline::new(self.timeline_id, layers))
}
@@ -2713,10 +2596,6 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
let open_layer = open_layer.clone();
drop(guard);
result = match open_layer
.get_value_reconstruct_data(
key,
@@ -2734,7 +2613,10 @@ impl Timeline {
traversal_path.push((
result,
cont_lsn,
Box::new(move || open_layer.traversal_id()),
Box::new({
let open_layer = Arc::clone(open_layer);
move || open_layer.traversal_id()
}),
));
continue 'outer;
}
@@ -2744,10 +2626,6 @@ impl Timeline {
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
let frozen_layer = frozen_layer.clone();
drop(guard);
result = match frozen_layer
.get_value_reconstruct_data(
key,
@@ -2765,7 +2643,10 @@ impl Timeline {
traversal_path.push((
result,
cont_lsn,
Box::new(move || frozen_layer.traversal_id()),
Box::new({
let frozen_layer = Arc::clone(frozen_layer);
move || frozen_layer.traversal_id()
}),
));
continue 'outer;
}
@@ -2773,8 +2654,6 @@ impl Timeline {
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
let layer = guard.get_from_desc(&layer);
drop(guard);
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
@@ -3905,11 +3784,8 @@ impl Timeline {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there hasn't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
self.get_last_record_lsn()
cutoff_horizon
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
@@ -4423,7 +4299,7 @@ impl Timeline {
let mut max_layer_size: Option<u64> = None;
let resident_layers = guard
.likely_resident_layers()
.resident_layers()
.map(|layer| {
let file_size = layer.layer_desc().file_size;
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
@@ -4436,7 +4312,8 @@ impl Timeline {
relative_last_activity: finite_f32::FiniteF32::ZERO,
}
})
.collect();
.collect()
.await;
DiskUsageEvictionInfo {
max_layer_size,
@@ -4581,6 +4458,49 @@ impl<'a> TimelineWriter<'a> {
res
}
/// "Tick" the timeline writer: it will roll the open layer if required
/// and do nothing else.
pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
self.open_layer_if_present().await?;
let last_record_lsn = self.get_last_record_lsn();
let action = self.get_open_layer_action(last_record_lsn, 0);
if action == OpenLayerAction::Roll {
self.roll_layer(last_record_lsn).await?;
}
Ok(())
}
/// Populate the timeline writer state only if an in-memory layer
/// is already open.
async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
assert!(self.write_guard.is_none());
let open_layer = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
match layers.open_layer {
Some(ref open_layer) => open_layer.clone(),
None => {
return Ok(());
}
}
};
let initial_size = open_layer.size().await?;
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
self.write_guard.replace(TimelineWriterState::new(
open_layer,
initial_size,
last_freeze_at,
last_freeze_ts,
));
Ok(())
}
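A sketch of how a background task might drive this tick so that idle timelines still roll their open layer on the time-based limit; the ten-second period and the `writer()` accessor used here to obtain a `TimelineWriter` are assumptions for illustration, not the actual wiring:

async fn writer_tick_loop(timeline: std::sync::Arc<Timeline>) -> anyhow::Result<()> {
    let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
    loop {
        interval.tick().await;
        // rolls the open layer only if the roll conditions are met
        timeline.writer().await.tick().await?;
    }
}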
async fn handle_open_layer_action(
&mut self,
at: Lsn,
@@ -4652,29 +4572,59 @@ impl<'a> TimelineWriter<'a> {
return OpenLayerAction::None;
}
if self.tl.should_roll(
state.current_size,
state.current_size + new_value_size,
self.get_checkpoint_distance(),
lsn,
state.cached_last_freeze_at,
state.cached_last_freeze_ts,
) {
let distance = lsn.widening_sub(state.cached_last_freeze_at);
let proposed_open_layer_size = state.current_size + new_value_size;
// Rolling the open layer can be triggered by:
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
// account for how writes are distributed across shards: we expect each node to consume
// 1/count of the LSN on average.
// 2. The size of the currently open layer.
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
// up and suspend activity.
if distance
>= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
{
info!(
"Will roll layer at {} with layer size {} due to LSN distance ({})",
lsn, state.current_size, distance
);
OpenLayerAction::Roll
} else if proposed_open_layer_size >= self.get_checkpoint_distance() {
info!(
"Will roll layer at {} with layer size {} due to layer size ({})",
lsn, state.current_size, proposed_open_layer_size
);
OpenLayerAction::Roll
} else if distance > 0
&& state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
{
info!(
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
lsn,
state.current_size,
state.cached_last_freeze_ts.elapsed()
);
OpenLayerAction::Roll
} else {
OpenLayerAction::None
}
}
/// Put a batch of keys at the specified Lsns.
/// Put a batch of keys at the specified Lsns.
///
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]).
/// The batch should be sorted by Lsn such that it's safe
/// to roll the open layer mid-batch.
pub(crate) async fn put_batch(
&mut self,
batch: VecMap<Lsn, (Key, Value)>,
batch: Vec<(Key, Lsn, Value)>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
for (lsn, (key, val)) in batch {
for (key, lsn, val) in batch {
self.put(key, lsn, &val, ctx).await?
}
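Since the new signature drops the `VecMap` that previously enforced ordering, the sorting obligation moves to the caller; a sketch under that reading, with `Key`, `Lsn`, and `Value` as used above:

fn sort_batch(mut batch: Vec<(Key, Lsn, Value)>) -> Vec<(Key, Lsn, Value)> {
    // keep the batch Lsn-ordered so the writer may roll the open layer
    // mid-batch without writing older Lsns into a newer layer
    batch.sort_by_key(|(_, lsn, _)| *lsn);
    batch
}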
@@ -4760,6 +4710,7 @@ mod tests {
.keep_resident()
.await
.expect("no download => no downloading errors")
.expect("should had been resident")
.drop_eviction_guard();
let forever = std::time::Duration::from_secs(120);
@@ -4770,7 +4721,7 @@ mod tests {
let (first, second) = tokio::join!(first, second);
let res = layer.keep_resident().await;
assert!(res.is_none(), "{res:?}");
assert!(matches!(res, Ok(None)), "{res:?}");
match (first, second) {
(Ok(()), Ok(())) => {

Some files were not shown because too many files have changed in this diff