Test

fix(walreceiver): Timeline::shutdown can leave a dangling handle_walreceiver_connection tokio task (#7235)
# Problem

As pointed out through doc-comments in this PR, `drop_old_connection` is not cancellation-safe.

This means we can leave a `handle_walreceiver_connection` tokio task dangling during Timeline shutdown.

More details are described in the corresponding issue #7062.

# Solution

Don't cancel-by-drop the `connection_manager_loop_step` from the `tokio::select!()` in the task_mgr task. Instead, transform the code to use a `CancellationToken` --- specifically, `task_mgr::shutdown_token()` --- and make the code responsive to it.

The `drop_old_connection()` is still not cancellation-safe and also doesn't get a cancellation token, because there's no point inside the function where we could return early if cancellation were requested using a token.

We rely on the `handle_walreceiver_connection` to be sensitive to the `TaskHandle`'s cancellation token (argument name: `cancellation`). Currently it checks for `cancellation` on each WAL message. It is probably also sensitive to `Timeline::cancel` because ultimately all that `handle_walreceiver_connection` does is interact with the `Timeline`.

In summary, the above means that the following code (which is found in `Timeline::shutdown`) now might **take longer**, but actually ensures that all `handle_walreceiver_connection` tasks are finished:

```rust
task_mgr::shutdown_tasks(
    Some(TaskKind::WalReceiverManager),
    Some(self.tenant_shard_id),
    Some(self.timeline_id)
)
```

# Refs

refs #7062
2026-06-20 05:40:38 +00:00 · 2024-03-27 13:42:33 +01:00 · 2024-03-27 12:04:31 +01:00 · 2024-03-26 19:31:19 +00:00 · 2024-03-26 18:29:08 +00:00 · 2024-03-26 17:44:18 +00:00
257 changed files with 13989 additions and 5721 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -461,6 +461,7 @@ jobs:
      - name: Pytest regression tests
        uses: ./.github/actions/run-python-test-set
        timeout-minutes: 60
        with:
          build_type: ${{ matrix.build_type }}
          test_selection: regress
@@ -474,7 +475,7 @@ jobs:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored
      # Temporary disable this step until we figure out why it's so flaky
@@ -554,7 +555,7 @@ jobs:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
@@ -1120,10 +1121,16 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
@@ -1132,6 +1139,15 @@ jobs:
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f deployStorage=false \
              -f deployStorageBroker=false \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
--- a/3
+++ b/3
@@ -1,12 +1,13 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /control_plane/attachment_service @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -276,7 +276,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "aws-config",
- "aws-sdk-secretsmanager",
+ "bytes",
 "camino",
 "clap",
 "control_plane",
@@ -288,6 +288,8 @@ dependencies = [
 "hex",
 "humantime",
 "hyper",
 "lasso",
 "measured",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -295,6 +297,7 @@ dependencies = [
 "postgres_connection",
 "r2d2",
 "reqwest",
 "routerify",
 "serde",
 "serde_json",
 "thiserror",
@@ -343,9 +346,9 @@ dependencies = [
 [[package]]
 name = "aws-credential-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
+checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-runtime-api",
@@ -355,9 +358,9 @@ dependencies = [
 [[package]]
 name = "aws-runtime"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
+checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
 dependencies = [
 "aws-credential-types",
 "aws-sigv4",
@@ -377,6 +380,29 @@ dependencies = [
 "uuid",
 ]
 [[package]]
 name = "aws-sdk-iam"
 version = "1.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
 dependencies = [
 "aws-credential-types",
 "aws-runtime",
 "aws-smithy-async",
 "aws-smithy-http",
 "aws-smithy-json",
 "aws-smithy-query",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
 "aws-smithy-xml",
 "aws-types",
 "http 0.2.9",
 "once_cell",
 "regex-lite",
 "tracing",
 ]
 [[package]]
 name = "aws-sdk-s3"
 version = "1.14.0"
@@ -406,29 +432,6 @@ dependencies = [
 "url",
 ]
 [[package]]
 name = "aws-sdk-secretsmanager"
 version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
 dependencies = [
 "aws-credential-types",
 "aws-runtime",
 "aws-smithy-async",
 "aws-smithy-http",
 "aws-smithy-json",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
 "aws-types",
 "bytes",
 "fastrand 2.0.0",
 "http 0.2.9",
 "once_cell",
 "regex-lite",
 "tracing",
 ]
 [[package]]
 name = "aws-sdk-sso"
 version = "1.12.0"
@@ -498,9 +501,9 @@ dependencies = [
 [[package]]
 name = "aws-sigv4"
-version = "1.1.4"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
+checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-eventstream",
@@ -513,7 +516,7 @@ dependencies = [
 "hex",
 "hmac",
 "http 0.2.9",
- "http 1.0.0",
+ "http 1.1.0",
 "once_cell",
 "p256",
 "percent-encoding",
@@ -527,9 +530,9 @@ dependencies = [
 [[package]]
 name = "aws-smithy-async"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
+checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
 dependencies = [
 "futures-util",
 "pin-project-lite",
@@ -570,9 +573,9 @@ dependencies = [
 [[package]]
 name = "aws-smithy-http"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
+checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
 dependencies = [
 "aws-smithy-eventstream",
 "aws-smithy-runtime-api",
@@ -591,18 +594,18 @@ dependencies = [
 [[package]]
 name = "aws-smithy-json"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
+checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
 dependencies = [
 "aws-smithy-types",
 ]
 [[package]]
 name = "aws-smithy-query"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
+checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
 dependencies = [
 "aws-smithy-types",
 "urlencoding",
@@ -610,9 +613,9 @@ dependencies = [
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
+checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-http",
@@ -635,14 +638,15 @@ dependencies = [
 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.1.4"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
+checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-types",
 "bytes",
 "http 0.2.9",
 "http 1.1.0",
 "pin-project-lite",
 "tokio",
 "tracing",
@@ -651,9 +655,9 @@ dependencies = [
 [[package]]
 name = "aws-smithy-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
+checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
 dependencies = [
 "base64-simd",
 "bytes",
@@ -674,18 +678,18 @@ dependencies = [
 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
+checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
 dependencies = [
 "xmlparser",
 ]
 [[package]]
 name = "aws-types"
-version = "1.1.4"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
+checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-async",
@@ -1346,6 +1350,7 @@ dependencies = [
 "futures",
 "git-version",
 "hex",
 "humantime",
 "hyper",
 "nix 0.27.1",
 "once_cell",
@@ -2391,9 +2396,9 @@ dependencies = [
 [[package]]
 name = "http"
-version = "1.0.0"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
+checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
 dependencies = [
 "bytes",
 "fnv",
@@ -2493,7 +2498,7 @@ dependencies = [
 "hyper",
 "log",
 "rustls 0.21.9",
- "rustls-native-certs",
+ "rustls-native-certs 0.6.2",
 "tokio",
 "tokio-rustls 0.24.0",
 ]
@@ -2879,6 +2884,35 @@ version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 [[package]]
 name = "measured"
 version = "0.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
 dependencies = [
 "bytes",
 "hashbrown 0.14.0",
 "itoa",
 "lasso",
 "measured-derive",
 "memchr",
 "parking_lot 0.12.1",
 "rustc-hash",
 "ryu",
 ]
 [[package]]
 name = "measured-derive"
 version = "0.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
 dependencies = [
 "heck",
 "proc-macro2",
 "quote",
 "syn 2.0.52",
 ]
 [[package]]
 name = "memchr"
 version = "2.6.4"
@@ -3529,6 +3563,7 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
 "procfs",
 "rand 0.8.5",
 "regex",
 "remote_storage",
@@ -3546,6 +3581,7 @@ dependencies = [
 "strum_macros",
 "svg_fmt",
 "sync_wrapper",
 "sysinfo",
 "tenant_size_model",
 "thiserror",
 "tokio",
@@ -3899,7 +3935,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3912,7 +3948,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3923,7 +3959,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3936,12 +3972,13 @@ dependencies = [
 "rand 0.8.5",
 "sha2",
 "stringprep",
 "tokio",
 ]
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4163,6 +4200,10 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
 "aws-config",
 "aws-sdk-iam",
 "aws-sigv4",
 "aws-types",
 "base64 0.13.1",
 "bstr",
 "bytes",
@@ -4173,6 +4214,7 @@ dependencies = [
 "consumption_metrics",
 "dashmap",
 "env_logger",
 "fallible-iterator",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -4180,6 +4222,7 @@ dependencies = [
 "hex",
 "hmac",
 "hostname",
 "http 1.1.0",
 "humantime",
 "hyper",
 "hyper-tungstenite",
@@ -4223,6 +4266,7 @@ dependencies = [
 "smallvec",
 "smol_str",
 "socket2 0.5.5",
 "subtle",
 "sync_wrapper",
 "task-local-extensions",
 "thiserror",
@@ -4394,9 +4438,9 @@ dependencies = [
 [[package]]
 name = "redis"
-version = "0.24.0"
+version = "0.25.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
+checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
 dependencies = [
 "async-trait",
 "bytes",
@@ -4405,15 +4449,15 @@ dependencies = [
 "itoa",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.9",
+ "rustls 0.22.2",
- "rustls-native-certs",
+ "rustls-native-certs 0.7.0",
- "rustls-pemfile 1.0.2",
+ "rustls-pemfile 2.1.1",
- "rustls-webpki 0.101.7",
+ "rustls-pki-types",
 "ryu",
 "sha1_smol",
- "socket2 0.4.9",
+ "socket2 0.5.5",
 "tokio",
- "tokio-rustls 0.24.0",
+ "tokio-rustls 0.25.0",
 "tokio-util",
 "url",
 ]
@@ -4842,6 +4886,19 @@ dependencies = [
 "security-framework",
 ]
 [[package]]
 name = "rustls-native-certs"
 version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
 dependencies = [
 "openssl-probe",
 "rustls-pemfile 2.1.1",
 "rustls-pki-types",
 "schannel",
 "security-framework",
 ]
 [[package]]
 name = "rustls-pemfile"
 version = "1.0.2"
@@ -5344,13 +5401,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
 [[package]]
 name = "sha2"
-version = "0.10.6"
+version = "0.10.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
+checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
 dependencies = [
 "cfg-if",
 "cpufeatures",
 "digest",
 "sha2-asm",
 ]
 [[package]]
 name = "sha2-asm"
 version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
 dependencies = [
 "cc",
 ]
 [[package]]
@@ -5886,7 +5953,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -5933,7 +6000,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6099,7 +6166,7 @@ dependencies = [
 "percent-encoding",
 "pin-project",
 "prost",
- "rustls-native-certs",
+ "rustls-native-certs 0.6.2",
 "rustls-pemfile 1.0.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -6423,7 +6490,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "bytes",
 "io-uring",
@@ -6466,6 +6533,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "arc-swap",
 "async-compression",
 "async-trait",
 "bincode",
 "byteorder",
@@ -6504,12 +6572,14 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
 "url",
 "uuid",
 "walkdir",
 "workspace_hack",
 ]
@@ -6981,7 +7051,6 @@ dependencies = [
 "aws-sigv4",
 "aws-smithy-async",
 "aws-smithy-http",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
 "axum",
 "base64 0.21.1",
@@ -7027,6 +7096,7 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
 "sha2",
 "smallvec",
 "subtle",
 "syn 1.0.109",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -52,10 +52,12 @@ async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "1.14"
-aws-sdk-secretsmanager = { version = "1.14.0" }
+aws-sdk-iam = "1.15.0"
 aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.1.4"
 aws-credential-types = "1.1.4"
 aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
 aws-types = "1.1.7"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -76,6 +78,7 @@ either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
@@ -88,6 +91,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
 http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
@@ -101,6 +105,7 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
 measured = { version = "0.0.13", features=["default", "lasso"] }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -120,7 +125,7 @@ procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
+redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -148,6 +153,7 @@ smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.76.0
+ENV RUSTC_VERSION=1.77.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install --git https://github.com/paritytech/cachepot && \
    cargo install rustfilt && \
    cargo install cargo-hakari && \
-    cargo install cargo-deny && \
+    cargo install cargo-deny --locked && \
    cargo install cargo-hack && \
    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
--- a/2
+++ b/2
@@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
 CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
 # Force cargo not to print progress bar
 CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
-# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
+# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
 CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
 #
--- a/README.md
+++ b/README.md
@@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop
 ## Running tests
 ### Rust unit tests
 We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
 Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
 You can install `cargo-nextest` with `cargo install cargo-nextest`.
 ### Integration tests
 Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
 ```sh
--- a/clippy.toml
+++ b/clippy.toml
@@ -2,6 +2,8 @@ disallowed-methods = [
    "tokio::task::block_in_place",
    # Allow this for now, to deny it later once we stop using Handle::block_on completely
    # "tokio::runtime::Handle::block_on",
    # use tokio_epoll_uring_ext instead
    "tokio_epoll_uring::thread_local_system",
 ]
 disallowed-macros = [
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \
            -b /usr/local/bin/postgres
 ```
 ## State Diagram
 Computes can be in various states. Below is a diagram that details how a
 compute moves between states.
 ```mermaid
 %% https://mermaid.js.org/syntax/stateDiagram.html
 stateDiagram-v2
  [*] --> Empty : Compute spawned
  Empty --> ConfigurationPending : Waiting for compute spec
  ConfigurationPending --> Configuration : Received compute spec
  Configuration --> Failed : Failed to configure the compute
  Configuration --> Running : Compute has been configured
  Empty --> Init : Compute spec is immediately available
  Empty --> TerminationPending : Requested termination
  Init --> Failed : Failed to start Postgres
  Init --> Running : Started Postgres
  Running --> TerminationPending : Requested termination
  TerminationPending --> Terminated : Terminated compute
  Failed --> [*] : Compute exited
  Terminated --> [*] : Compute exited
 ```
 ## Tests
 Cargo formatter:
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use nix::unistd::Pid;
 use postgres::error::SqlState;
 use postgres::{Client, NoTls};
 use tracing::{debug, error, info, instrument, warn};
@@ -722,8 +723,12 @@ impl ComputeNode {
        // Stop it when it's ready
        info!("waiting for postgres");
        wait_for_postgres(&mut pg, Path::new(pgdata))?;
-        pg.kill()?;
+        // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
-        info!("sent kill signal");
+        // it to avoid orphaned processes prowling around while datadir is
        // wiped.
        let pm_pid = Pid::from_raw(pg.id() as i32);
        kill(pm_pid, Signal::SIGQUIT)?;
        info!("sent SIGQUIT signal");
        pg.wait()?;
        info!("done prewarming");
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
        .write(true)
        .create(true)
        .append(false)
        .truncate(false)
        .open(path)?;
    let buf = io::BufReader::new(&file);
    let mut count: usize = 0;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            RoleAction::Create => {
                // This branch only runs when roles are created through the console, so it is
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
+                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("running role create query: '{}'", &query);
@@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // which may happen in two cases:
    // - extension was just installed
    // - extension was already installed and is up to date
-    let query = "ALTER EXTENSION neon UPDATE";
+    // DISABLED due to compute node unpinning epic
-    info!("update neon extension version with query: {}", query);
+    // let query = "ALTER EXTENSION neon UPDATE";
-    client.simple_query(query)?;
+    // info!("update neon extension version with query: {}", query);
    // client.simple_query(query)?;
    Ok(())
 }
 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
+pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
-    info!("handle neon extension upgrade");
+    info!("handle neon extension upgrade (not really)");
-    let query = "ALTER EXTENSION neon UPDATE";
+    // DISABLED due to compute node unpinning epic
-    info!("update neon extension version with query: {}", query);
+    // let query = "ALTER EXTENSION neon UPDATE";
-    client.simple_query(query)?;
+    // info!("update neon extension version with query: {}", query);
    // client.simple_query(query)?;
    Ok(())
 }
@@ -805,6 +807,18 @@ $$;"#,
        "",
        "",
        // Add new migrations below.
        r#"
 DO $$
 DECLARE
    role_name TEXT;
 BEGIN
    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
    LOOP
        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
    END LOOP;
 END
 $$;"#,
    ];
    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -12,6 +12,7 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 git-version.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -16,7 +16,7 @@ testing = []
 [dependencies]
 anyhow.workspace = true
 aws-config.workspace = true
-aws-sdk-secretsmanager.workspace = true
+bytes.workspace = true
 camino.workspace = true
 clap.workspace = true
 fail.workspace = true
@@ -25,17 +25,20 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
 lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
 reqwest.workspace = true
 routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 measured.workspace = true
 diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
@@ -0,0 +1,3 @@
 UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
 UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
@@ -0,0 +1,3 @@
 UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
 UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -1,4 +1,3 @@
 use std::sync::Arc;
 use std::{collections::HashMap, time::Duration};
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -24,13 +23,10 @@ struct ShardedComputeHookTenant {
    stripe_size: ShardStripeSize,
    shard_count: ShardCount,
    shards: Vec<(ShardNumber, NodeId)>,
    // Async lock used for ensuring that remote compute hook calls are ordered identically to updates to this structure
    lock: Arc<tokio::sync::Mutex<()>>,
 }
 enum ComputeHookTenant {
-    Unsharded((NodeId, Arc<tokio::sync::Mutex<()>>)),
+    Unsharded(NodeId),
    Sharded(ShardedComputeHookTenant),
 }
@@ -42,17 +38,9 @@ impl ComputeHookTenant {
                shards: vec![(tenant_shard_id.shard_number, node_id)],
                stripe_size,
                shard_count: tenant_shard_id.shard_count,
                lock: Arc::default(),
            })
        } else {
-            Self::Unsharded((node_id, Arc::default()))
+            Self::Unsharded(node_id)
        }
    }
    fn get_lock(&self) -> &Arc<tokio::sync::Mutex<()>> {
        match self {
            Self::Unsharded((_node_id, lock)) => lock,
            Self::Sharded(sharded_tenant) => &sharded_tenant.lock,
        }
    }
@@ -65,9 +53,7 @@ impl ComputeHookTenant {
        node_id: NodeId,
    ) {
        match self {
-            Self::Unsharded((existing_node_id, _lock))
+            Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
                if tenant_shard_id.shard_count.count() == 1 =>
            {
                *existing_node_id = node_id
            }
            Self::Sharded(sharded_tenant)
@@ -136,15 +122,9 @@ pub(crate) enum NotifyError {
 }
 impl ComputeHookTenant {
-    fn maybe_reconfigure(
+    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
-        &self,
+        match self {
-        tenant_id: TenantId,
+            Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
    ) -> Option<(
        ComputeHookNotifyRequest,
        impl std::future::Future<Output = tokio::sync::OwnedMutexGuard<()>>,
    )> {
        let request = match self {
            Self::Unsharded((node_id, _lock)) => Some(ComputeHookNotifyRequest {
                tenant_id,
                shards: vec![ComputeHookNotifyRequestShard {
                    shard_number: ShardNumber(0),
@@ -178,9 +158,7 @@ impl ComputeHookTenant {
                );
                None
            }
-        };
+        }
        request.map(|r| (r, self.get_lock().clone().lock_owned()))
    }
 }
@@ -189,11 +167,8 @@ impl ComputeHookTenant {
 /// the compute connection string.
 pub(super) struct ComputeHook {
    config: Config,
-    state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
    authorization_header: Option<String>,
    // This lock is only used in testing environments, to serialize calls into neon_local
    neon_local_lock: tokio::sync::Mutex<()>,
 }
 impl ComputeHook {
@@ -207,7 +182,6 @@ impl ComputeHook {
            state: Default::default(),
            config,
            authorization_header,
            neon_local_lock: Default::default(),
        }
    }
@@ -216,10 +190,6 @@ impl ComputeHook {
        &self,
        reconfigure_request: ComputeHookNotifyRequest,
    ) -> anyhow::Result<()> {
        // neon_local updates are not safe to call concurrently, use a lock to serialize
        // all calls to this function
        let _locked = self.neon_local_lock.lock().await;
        let env = match LocalEnv::load_config() {
            Ok(e) => e,
            Err(e) => {
@@ -370,38 +340,30 @@ impl ComputeHook {
        stripe_size: ShardStripeSize,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
-        let reconfigure_request = {
+        let mut locked = self.state.lock().await;
            let mut locked = self.state.lock().unwrap();
-            use std::collections::hash_map::Entry;
+        use std::collections::hash_map::Entry;
-            let tenant = match locked.entry(tenant_shard_id.tenant_id) {
+        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
-                Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
-                    tenant_shard_id,
+                tenant_shard_id,
-                    stripe_size,
+                stripe_size,
-                    node_id,
+                node_id,
-                )),
+            )),
-                Entry::Occupied(e) => {
+            Entry::Occupied(e) => {
-                    let tenant = e.into_mut();
+                let tenant = e.into_mut();
-                    tenant.update(tenant_shard_id, stripe_size, node_id);
+                tenant.update(tenant_shard_id, stripe_size, node_id);
-                    tenant
+                tenant
-                }
+            }
            };
            tenant.maybe_reconfigure(tenant_shard_id.tenant_id)
        };
-        let Some((reconfigure_request, lock_fut)) = reconfigure_request else {
+
        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
        let Some(reconfigure_request) = reconfigure_request else {
            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
            // until it does.
            tracing::info!("Tenant isn't yet ready to emit a notification");
            return Ok(());
        };
        // Finish acquiring the tenant's async lock: this future was created inside the self.state
        // lock above, so we are guaranteed to get this lock in the same order as callers took
        // that lock.  This ordering is essential: the cloud control plane must end up with the
        // same end state for the tenant that we see.
        let _guard = lock_fut.await;
        if let Some(notify_url) = &self.config.compute_hook_url {
            self.do_notify(notify_url, reconfigure_request, cancel)
                .await
@@ -443,7 +405,6 @@ pub(crate) mod tests {
            tenant_state
                .maybe_reconfigure(tenant_id)
                .unwrap()
                .0
                .shards
                .len(),
            1
@@ -451,7 +412,6 @@ pub(crate) mod tests {
        assert!(tenant_state
            .maybe_reconfigure(tenant_id)
            .unwrap()
            .0
            .stripe_size
            .is_none());
@@ -485,7 +445,6 @@ pub(crate) mod tests {
            tenant_state
                .maybe_reconfigure(tenant_id)
                .unwrap()
                .0
                .shards
                .len(),
            2
@@ -494,7 +453,6 @@ pub(crate) mod tests {
            tenant_state
                .maybe_reconfigure(tenant_id)
                .unwrap()
                .0
                .stripe_size,
            Some(ShardStripeSize(32768))
        );
--- a/control_plane/attachment_service/src/heartbeater.rs
+++ b/control_plane/attachment_service/src/heartbeater.rs
@@ -0,0 +1,227 @@
 use futures::{stream::FuturesUnordered, StreamExt};
 use std::{
    collections::HashMap,
    sync::Arc,
    time::{Duration, Instant},
 };
 use tokio_util::sync::CancellationToken;
 use pageserver_api::{
    controller_api::{NodeAvailability, UtilizationScore},
    models::PageserverUtilization,
 };
 use thiserror::Error;
 use utils::id::NodeId;
 use crate::node::Node;
 /// Background worker that services [`HeartbeatRequest`]s received over a channel and
 /// remembers the last recorded [`PageserverState`] of every node it has probed.
 struct HeartbeaterTask {
    /// Incoming heartbeat requests, sent by [`Heartbeater::heartbeat`].
    receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
    /// Stops the request loop and aborts a heartbeat round that is in progress.
    cancel: CancellationToken,
    /// Last recorded state per node; new probe results are compared against it.
    state: HashMap<NodeId, PageserverState>,
    /// How long a previously available node may go unseen before it is reported offline.
    max_unavailable_interval: Duration,
    /// Token passed to the pageserver client for each utilization request.
    jwt_token: Option<String>,
 }
 /// Result of the most recent utilization probe of a pageserver, as recorded by the heartbeater.
 #[derive(Debug, Clone)]
 pub(crate) enum PageserverState {
    /// The utilization request to the node succeeded.
    Available {
        /// Local time at which the successful response was recorded.
        last_seen_at: Instant,
        /// Utilization payload returned by the node.
        utilization: PageserverUtilization,
    },
    /// The utilization request to the node failed.
    Offline,
 }
 /// Output of one heartbeat round: `(node, new state)` pairs for the nodes the heartbeater
 /// decided to report. A node is reported when it was not known before, when it is currently
 /// available, or when it went offline after the grace interval.
 #[derive(Debug)]
 pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
 /// Errors returned from a heartbeat round.
 #[derive(Debug, Error)]
 pub(crate) enum HeartbeaterError {
    /// The heartbeater's cancellation token fired while the round was in progress.
    #[error("Cancelled")]
    Cancel,
 }
 /// A single heartbeat round requested through [`Heartbeater::heartbeat`].
 struct HeartbeatRequest {
    /// Nodes to probe in this round.
    pageservers: Arc<HashMap<NodeId, Node>>,
    /// One-shot channel on which the task sends back the result of the round.
    reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
 }
 /// Handle used to submit heartbeat rounds to the spawned [`HeartbeaterTask`].
 pub(crate) struct Heartbeater {
    /// Sending half of the request channel consumed by the task.
    sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
 }
 impl Heartbeater {
    pub(crate) fn new(
        jwt_token: Option<String>,
        max_unavailable_interval: Duration,
        cancel: CancellationToken,
    ) -> Self {
        let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
        let mut heartbeater =
            HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
        tokio::task::spawn(async move { heartbeater.run().await });
        Self { sender }
    }
    pub(crate) async fn heartbeat(
        &self,
        pageservers: Arc<HashMap<NodeId, Node>>,
    ) -> Result<AvailablityDeltas, HeartbeaterError> {
        let (sender, receiver) = tokio::sync::oneshot::channel();
        self.sender
            .send(HeartbeatRequest {
                pageservers,
                reply: sender,
            })
            .unwrap();
        receiver.await.unwrap()
    }
 }
 impl HeartbeaterTask {
    fn new(
        receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
        jwt_token: Option<String>,
        max_unavailable_interval: Duration,
        cancel: CancellationToken,
    ) -> Self {
        Self {
            receiver,
            cancel,
            state: HashMap::new(),
            max_unavailable_interval,
            jwt_token,
        }
    }
    async fn run(&mut self) {
        loop {
            tokio::select! {
                request = self.receiver.recv() => {
                    match request {
                        Some(req) => {
                            let res = self.heartbeat(req.pageservers).await;
                            req.reply.send(res).unwrap();
                        },
                        None => { return; }
                    }
                },
                _ = self.cancel.cancelled() => return
            }
        }
    }
    async fn heartbeat(
        &mut self,
        pageservers: Arc<HashMap<NodeId, Node>>,
    ) -> Result<AvailablityDeltas, HeartbeaterError> {
        let mut new_state = HashMap::new();
        let mut heartbeat_futs = FuturesUnordered::new();
        for (node_id, node) in &*pageservers {
            heartbeat_futs.push({
                let jwt_token = self.jwt_token.clone();
                let cancel = self.cancel.clone();
                // Clone the node and mark it as available such that the request
                // goes through to the pageserver even when the node is marked offline.
                // This doesn't impact the availability observed by [`crate::service::Service`].
                let mut node = node.clone();
                node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                async move {
                    let response = node
                        .with_client_retries(
                            |client| async move { client.get_utilization().await },
                            &jwt_token,
                            3,
                            3,
                            Duration::from_secs(1),
                            &cancel,
                        )
                        .await;
                    let response = match response {
                        Some(r) => r,
                        None => {
                            // This indicates cancellation of the request.
                            // We ignore the node in this case.
                            return None;
                        }
                    };
                    let status = if let Ok(utilization) = response {
                        PageserverState::Available {
                            last_seen_at: Instant::now(),
                            utilization,
                        }
                    } else {
                        PageserverState::Offline
                    };
                    Some((*node_id, status))
                }
            });
            loop {
                let maybe_status = tokio::select! {
                    next = heartbeat_futs.next() => {
                        match next {
                            Some(result) => result,
                            None => { break; }
                        }
                    },
                    _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
                };
                if let Some((node_id, status)) = maybe_status {
                    new_state.insert(node_id, status);
                }
            }
        }
        let mut deltas = Vec::new();
        let now = Instant::now();
        for (node_id, ps_state) in new_state {
            use std::collections::hash_map::Entry::*;
            let entry = self.state.entry(node_id);
            let mut needs_update = false;
            match entry {
                Occupied(ref occ) => match (occ.get(), &ps_state) {
                    (PageserverState::Offline, PageserverState::Offline) => {}
                    (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
                        if now - *last_seen_at >= self.max_unavailable_interval {
                            deltas.push((node_id, ps_state.clone()));
                            needs_update = true;
                        }
                    }
                    _ => {
                        deltas.push((node_id, ps_state.clone()));
                        needs_update = true;
                    }
                },
                Vacant(_) => {
                    deltas.push((node_id, ps_state.clone()));
                }
            }
            match entry {
                Occupied(mut occ) if needs_update => {
                    (*occ.get_mut()) = ps_state;
                }
                Vacant(vac) => {
                    vac.insert(ps_state);
                }
                _ => {}
            }
        }
        Ok(AvailablityDeltas(deltas))
    }
 }
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,5 +1,11 @@
 use crate::metrics::{
    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
    METRICS_REGISTRY,
 };
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
@@ -14,7 +20,7 @@ use tokio_util::sync::CancellationToken;
 use utils::auth::{Scope, SwappableJwtAuth};
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
-use utils::http::request::{must_get_query_param, parse_request_param};
+use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};
 use utils::{
@@ -28,11 +34,13 @@ use utils::{
 };
 use pageserver_api::controller_api::{
-    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
-use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
+use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
 use routerify::Middleware;
 /// State available to HTTP request handlers
 #[derive(Clone)]
@@ -176,14 +184,14 @@ async fn handle_tenant_location_config(
    service: Arc<Service>,
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    check_permissions(&req, Scope::PageServerApi)?;
    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
    json_response(
        StatusCode::OK,
        service
-            .tenant_location_config(tenant_id, config_req)
+            .tenant_location_config(tenant_shard_id, config_req)
            .await?,
    )
 }
@@ -248,8 +256,10 @@ async fn handle_tenant_secondary_download(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    service.tenant_secondary_download(tenant_id).await?;
+    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
-    json_response(StatusCode::OK, ())
+
    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
    json_response(status, progress)
 }
 async fn handle_tenant_delete(
@@ -311,7 +321,7 @@ async fn handle_tenant_timeline_passthrough(
    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
    // Find the node that holds shard zero
-    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
+    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
    // rewrite this to a shard-aware shard zero ID.
@@ -320,12 +330,39 @@ async fn handle_tenant_timeline_passthrough(
    let tenant_shard_str = format!("{}", tenant_shard_id);
    let path = path.replace(&tenant_str, &tenant_shard_str);
-    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
+    let latency = &METRICS_REGISTRY
        .metrics_group
        .storage_controller_passthrough_request_latency;
    // This is a bit awkward. We remove the param from the request
    // and join the words by '_' to get a label for the request.
    let just_path = path.replace(&tenant_shard_str, "");
    let path_label = just_path
        .split('/')
        .filter(|token| !token.is_empty())
        .collect::<Vec<_>>()
        .join("_");
    let labels = PageserverRequestLabelGroup {
        pageserver_id: &node.get_id().to_string(),
        path: &path_label,
        method: crate::metrics::Method::Get,
    };
    let _timer = latency.start_timer(labels.clone());
    let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
    let resp = client.get_raw(path).await.map_err(|_e|
        // FIXME: give ApiError a proper Unavailable variant.  We return 503 here because
        // if we can't successfully send a request to the pageserver, we aren't available.
        ApiError::ShuttingDown)?;
    if !resp.status().is_success() {
        let error_counter = &METRICS_REGISTRY
            .metrics_group
            .storage_controller_passthrough_request_error;
        error_counter.inc(labels);
    }
    // We have a reqwest::Response, would like a http::Response
    let mut builder = hyper::Response::builder()
        .status(resp.status())
@@ -351,6 +388,16 @@ async fn handle_tenant_locate(
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 /// Admin-only handler that returns the result of `Service::tenant_describe` for the
 /// tenant named by the `tenant_id` path parameter, serialized as JSON with status 200.
 async fn handle_tenant_describe(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    // Authorization is checked before the request parameters are parsed.
    check_permissions(&req, Scope::Admin)?;
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let description = service.tenant_describe(tenant_id)?;
    json_response(StatusCode::OK, description)
 }
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
@@ -389,7 +436,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(
        StatusCode::OK,
-        state.service.node_configure(config_req).await?,
+        state
            .service
            .node_configure(
                config_req.node_id,
                config_req.availability.map(NodeAvailability::from),
                config_req.scheduling,
            )
            .await?,
    )
 }
@@ -440,24 +494,6 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
    state.service.tenants_dump()
 }
 async fn handle_balance_all(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
    service.balance_all()?;
    json_response(StatusCode::OK, ())
 }
 async fn handle_balance_attached(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
    service.balance_attached()?;
    json_response(StatusCode::OK, ())
 }
 async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
@@ -497,7 +533,11 @@ impl From<ReconcileError> for ApiError {
 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
-async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
+async fn tenant_service_handler<R, H>(
    request: Request<Body>,
    handler: H,
    request_name: RequestName,
 ) -> R::Output
 where
    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -517,9 +557,10 @@ where
        ));
    }
-    request_span(
+    named_request_span(
        request,
        |request| async move { handler(service, request).await },
        request_name,
    )
    .await
 }
@@ -530,11 +571,98 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
    })
 }
 /// Per-request data stored in the request context by `prologue_metrics_middleware` and
 /// read back by `epilogue_metrics_middleware` to label and time the request.
 #[derive(Clone, Debug)]
 struct RequestMeta {
    /// HTTP method of the request.
    method: hyper::http::Method,
    /// Time at which the prologue middleware saw the request.
    at: Instant,
 }
 /// Pre-middleware that records the request's method and arrival time in the request
 /// context as a [`RequestMeta`].
 fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        let method = req.method().clone();
        let at = Instant::now();
        req.set_context(RequestMeta { method, at });
        Ok(req)
    })
 }
 /// Post-middleware that records the status counter and latency histogram for requests
 /// that carry both a [`RequestName`] and a [`RequestMeta`] in their context.
 ///
 /// Requests missing either piece of context pass through without being recorded.
 fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
 ) -> Middleware<B, ApiError> {
    Middleware::post_with_info(move |resp, req_info| async move {
        // Only requests that were given a name are measured.
        let Some(request_name) = req_info.context::<RequestName>() else {
            return Ok(resp);
        };
        let Some(meta) = req_info.context::<RequestMeta>() else {
            return Ok(resp);
        };
        let metrics = &crate::metrics::METRICS_REGISTRY.metrics_group;
        metrics
            .storage_controller_http_request_status
            .inc(HttpRequestStatusLabelGroup {
                path: request_name.0,
                method: meta.method.clone().into(),
                status: crate::metrics::StatusCode(resp.status()),
            });
        // Latency is measured from the timestamp taken by the prologue middleware.
        metrics.storage_controller_http_request_latency.observe(
            HttpRequestLatencyLabelGroup {
                path: request_name.0,
                method: meta.method.into(),
            },
            meta.at.elapsed().as_secs_f64(),
        );
        Ok(resp)
    })
 }
 /// Serves the encoded contents of `METRICS_REGISTRY` as a 200 response with the
 /// `text/plain; version=0.0.4` content type. The incoming request is not inspected.
 pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
    let body: Body = crate::metrics::METRICS_REGISTRY.encode().into();
    Ok(Response::builder()
        .status(200)
        .header(CONTENT_TYPE, TEXT_FORMAT)
        .body(body)
        .unwrap())
 }
 /// Static label identifying a route. It is stored in the request context and used as the
 /// `path` label of the HTTP request metrics.
 #[derive(Clone)]
 struct RequestName(&'static str);
 /// Stores `name` in the request's context, then runs `handler` through `request_span`.
 ///
 /// The stored [`RequestName`] is what `epilogue_metrics_middleware` looks up to label the
 /// request's metrics.
 async fn named_request_span<R, H>(
    request: Request<Body>,
    handler: H,
    name: RequestName,
 ) -> R::Output
 where
    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
 {
    // The name must be attached before the handler runs so it is present in the context
    // when the response is post-processed.
    request.set_context(name);
    request_span(request, handler).await
 }
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    let mut router = endpoint::make_router();
+    let mut router = endpoint::make_router()
        .middleware(prologue_metrics_middleware())
        .middleware(epilogue_metrics_middleware());
    if auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            let state = get_state(request);
@@ -543,102 +671,166 @@ pub fn make_router(
            } else {
                state.auth.as_deref()
            }
-        }))
+        }));
    }
    router
        .data(Arc::new(HttpState::new(service, auth)))
        .get("/metrics", |r| {
            named_request_span(r, measured_metrics_handler, RequestName("metrics"))
        })
        // Non-prefixed generic endpoints (status, metrics)
-        .get("/status", |r| request_span(r, handle_status))
+        .get("/status", |r| {
-        .get("/ready", |r| request_span(r, handle_ready))
+            named_request_span(r, handle_status, RequestName("status"))
        })
        .get("/ready", |r| {
            named_request_span(r, handle_ready, RequestName("ready"))
        })
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
        .post("/upcall/v1/re-attach", |r| {
-            request_span(r, handle_re_attach)
+            named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
        })
        .post("/upcall/v1/validate", |r| {
            named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
        })
        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
        // Test/dev/debug endpoints
        .post("/debug/v1/attach-hook", |r| {
-            request_span(r, handle_attach_hook)
+            named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
        })
        .post("/debug/v1/inspect", |r| {
            named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
        })
        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
        .post("/debug/v1/tenant/:tenant_id/drop", |r| {
-            request_span(r, handle_tenant_drop)
+            named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
        })
        .post("/debug/v1/node/:node_id/drop", |r| {
-            request_span(r, handle_node_drop)
+            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
        })
        .get("/debug/v1/tenant", |r| {
            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
        })
        .get("/debug/v1/tenant/:tenant_id/locate", |r| {
            tenant_service_handler(
                r,
                handle_tenant_locate,
                RequestName("debug_v1_tenant_locate"),
            )
        })
        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
        .get("/debug/v1/scheduler", |r| {
-            request_span(r, handle_scheduler_dump)
+            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
        })
        .post("/debug/v1/consistency_check", |r| {
-            request_span(r, handle_consistency_check)
+            named_request_span(
                r,
                handle_consistency_check,
                RequestName("debug_v1_consistency_check"),
            )
        })
        .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
        })
        .get("/control/v1/tenant/:tenant_id/locate", |r| {
            tenant_service_handler(r, handle_tenant_locate)
        })
        // Node operations
        .post("/control/v1/node", |r| {
-            request_span(r, handle_node_register)
+            named_request_span(r, handle_node_register, RequestName("control_v1_node"))
        })
        .get("/control/v1/node", |r| {
            named_request_span(r, handle_node_list, RequestName("control_v1_node"))
        })
        .get("/control/v1/node", |r| request_span(r, handle_node_list))
        .put("/control/v1/node/:node_id/config", |r| {
-            request_span(r, handle_node_configure)
+            named_request_span(
                r,
                handle_node_configure,
                RequestName("control_v1_node_config"),
            )
        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(r, handle_tenant_shard_migrate)
+            tenant_service_handler(
                r,
                handle_tenant_shard_migrate,
                RequestName("control_v1_tenant_migrate"),
            )
        })
        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(r, handle_tenant_shard_split)
+            tenant_service_handler(
                r,
                handle_tenant_shard_split,
                RequestName("control_v1_tenant_shard_split"),
            )
        })
-        .post("/control/v1/balance/all", |r| {
+        .get("/control/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_balance_all)
+            tenant_service_handler(
-        })
+                r,
-        .post("/control/v1/balance/attached", |r| {
+                handle_tenant_describe,
-            tenant_service_handler(r, handle_balance_attached)
+                RequestName("control_v1_tenant_describe"),
            )
        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
        .post("/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_create)
+            tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
        })
        .delete("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_delete)
+            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
        })
        .put("/v1/tenant/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_set)
+            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_get)
+            tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
        })
-        .put("/v1/tenant/:tenant_id/location_config", |r| {
+        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
-            tenant_service_handler(r, handle_tenant_location_config)
+            tenant_service_handler(
                r,
                handle_tenant_location_config,
                RequestName("v1_tenant_location_config"),
            )
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
-            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
+            tenant_service_handler(
                r,
                handle_tenant_time_travel_remote_storage,
                RequestName("v1_tenant_time_travel_remote_storage"),
            )
        })
        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
-            tenant_service_handler(r, handle_tenant_secondary_download)
+            tenant_service_handler(
                r,
                handle_tenant_secondary_download,
                RequestName("v1_tenant_secondary_download"),
            )
        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_delete)
+            tenant_service_handler(
                r,
                handle_tenant_timeline_delete,
                RequestName("v1_tenant_timeline"),
            )
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_create)
+            tenant_service_handler(
                r,
                handle_tenant_timeline_create,
                RequestName("v1_tenant_timeline"),
            )
        })
        // Tenant detail GET passthrough to shard zero
        .get("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_passthrough)
+            tenant_service_handler(
                r,
                handle_tenant_timeline_passthrough,
                RequestName("v1_tenant_passthrough"),
            )
        })
        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
        // timeline GET APIs will be implicitly included.
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_passthrough)
+            tenant_service_handler(
                r,
                handle_tenant_timeline_passthrough,
                RequestName("v1_tenant_timeline_passthrough"),
            )
        })
 }
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -3,10 +3,12 @@ use utils::seqwait::MonotonicCounter;
 mod auth;
 mod compute_hook;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
 pub mod metrics;
 mod node;
 mod pageserver_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -1,15 +1,8 @@
 /// The attachment service mimics the aspects of the control plane API
 /// that are required for a pageserver to operate.
 ///
 /// This enables running & testing pageservers without a full-blown
 /// deployment of the Neon cloud platform.
 ///
 use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
-use attachment_service::service::{Config, Service};
+use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
 use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
@@ -60,6 +53,30 @@ struct Cli {
    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
    #[arg(long)]
    database_url: Option<String>,
    /// Flag to enable dev mode, which permits running without auth
    #[arg(long, default_value = "false")]
    dev: bool,
    /// Grace period before marking unresponsive pageserver offline
    #[arg(long)]
    max_unavailable_interval: Option<humantime::Duration>,
 }
 /// Selects how strictly the start-up configuration is validated.
 ///
 /// `Strict` is the default variant; the `Default` implementation is derived via `#[default]`
 /// instead of being written by hand.
 #[derive(Default)]
 enum StrictMode {
    /// In strict mode, we will require that all secrets are loaded, i.e. security features
    /// may not be implicitly turned off by omitting secrets in the environment.
    #[default]
    Strict,
    /// In dev mode, secrets are optional, and omitting a particular secret will implicitly
    /// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
    /// requests, no public key -> don't authenticate incoming requests).
    Dev,
 }
 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -72,13 +89,6 @@ struct Secrets {
 }
 impl Secrets {
    const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
    const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
        "neon-storage-controller-pageserver-jwt-token";
    const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
        "neon-storage-controller-control-plane-jwt-token";
    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
@@ -89,111 +99,41 @@ impl Secrets {
    /// - Environment variables if DATABASE_URL is set.
    /// - AWS Secrets Manager secrets
    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        match &args.database_url {
+        let Some(database_url) =
-            Some(url) => Self::load_cli(url, args),
+            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
            None => match std::env::var(Self::DATABASE_URL_ENV) {
                Ok(database_url) => Self::load_env(database_url),
                Err(_) => Self::load_aws_sm().await,
            },
        }
    }
    fn load_env(database_url: String) -> anyhow::Result<Self> {
        let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
            Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
            Err(_) => None,
        };
        Ok(Self {
            database_url,
            public_key,
            jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
            control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
        })
    }
    async fn load_aws_sm() -> anyhow::Result<Self> {
        let Ok(region) = std::env::var("AWS_REGION") else {
            anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
        };
        let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
            .region(Region::new(region.clone()))
            .load()
            .await;
        let asm = aws_sdk_secretsmanager::Client::new(&config);
        let Some(database_url) = asm
            .get_secret_value()
            .secret_id(Self::DATABASE_URL_SECRET)
            .send()
            .await?
            .secret_string()
            .map(str::to_string)
        else {
            anyhow::bail!(
-                "Database URL secret not found at {region}/{}",
+                "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
                Self::DATABASE_URL_SECRET
            )
        };
-        let jwt_token = asm
+        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
-            .get_secret_value()
+            Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
-            .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
+            None => None,
            .send()
            .await?
            .secret_string()
            .map(str::to_string);
        if jwt_token.is_none() {
            tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
        }
        let control_plane_jwt_token = asm
            .get_secret_value()
            .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
            .send()
            .await?
            .secret_string()
            .map(str::to_string);
        if jwt_token.is_none() {
            tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
        }
        let public_key = asm
            .get_secret_value()
            .secret_id(Self::PUBLIC_KEY_SECRET)
            .send()
            .await?
            .secret_string()
            .map(str::to_string);
        let public_key = match public_key {
            Some(key) => Some(JwtAuth::from_key(key)?),
            None => {
                tracing::warn!(
                    "No public key set: inccoming HTTP requests will not be authenticated"
                );
                None
            }
        };
-        Ok(Self {
+        let this = Self {
            database_url,
            public_key,
-            jwt_token,
+            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
-            control_plane_jwt_token,
+            control_plane_jwt_token: Self::load_secret(
-        })
+                &args.control_plane_jwt_token,
                Self::CONTROL_PLANE_JWT_TOKEN_ENV,
            )
            .await,
        };
        Ok(this)
    }
-    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
+    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
-        let public_key = match &args.public_key {
+        if let Some(v) = cli {
-            None => None,
+            Some(v.clone())
-            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
+        } else if let Ok(v) = std::env::var(env_name) {
-        };
+            Some(v)
-        Ok(Self {
+        } else {
-            database_url: database_url.to_owned(),
+            None
-            public_key,
+        }
            jwt_token: args.jwt_token.clone(),
            control_plane_jwt_token: args.control_plane_jwt_token.clone(),
        })
    }
 }
@@ -212,6 +152,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
 }
 fn main() -> anyhow::Result<()> {
    let default_panic = std::panic::take_hook();
    std::panic::set_hook(Box::new(move |info| {
        default_panic(info);
        std::process::exit(1);
    }));
    tokio::runtime::Builder::new_current_thread()
        // We use spawn_blocking for database operations, so require approximately
        // as many blocking threads as we will open database connections.
@@ -243,12 +189,50 @@ async fn async_main() -> anyhow::Result<()> {
        args.listen
    );
    let strict_mode = if args.dev {
        StrictMode::Dev
    } else {
        StrictMode::Strict
    };
    let secrets = Secrets::load(&args).await?;
    // Validate required secrets and arguments are provided in strict mode
    match strict_mode {
        StrictMode::Strict
            if (secrets.public_key.is_none()
                || secrets.jwt_token.is_none()
                || secrets.control_plane_jwt_token.is_none()) =>
        {
            // Production systems should always have secrets configured: if public_key was not set
            // then we would implicitly disable auth.
            anyhow::bail!(
                    "Insecure config!  One or more secrets is not set.  This is only permitted in `--dev` mode"
                );
        }
        StrictMode::Strict if args.compute_hook_url.is_none() => {
            // Production systems should always have a compute hook set, to prevent falling
            // back to trying to use neon_local.
            anyhow::bail!(
                "`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
            );
        }
        StrictMode::Strict => {
            tracing::info!("Starting in strict mode: configuration is OK.")
        }
        StrictMode::Dev => {
            tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
        }
    }
    let config = Config {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
        compute_hook_url: args.compute_hook_url,
        max_unavailable_interval: args
            .max_unavailable_interval
            .map(humantime::Duration::into)
            .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
    };
    // After loading secrets & config, but before starting anything else, apply database migrations
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -1,32 +1,284 @@
-use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+//!
 //! This module provides metric definitions for the storage controller.
 //!
 //! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
 //! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
 //! constant.
 //!
 //! The rest of the code defines label group types and deals with converting outer types to labels.
 //!
 use bytes::Bytes;
 use measured::{
    label::{LabelValue, StaticLabelSet},
    FixedCardinalityLabel, MetricGroup,
 };
 use once_cell::sync::Lazy;
 use std::sync::Mutex;
-pub(crate) struct ReconcilerMetrics {
+use crate::persistence::{DatabaseError, DatabaseOperation};
    pub(crate) spawned: IntCounter,
    pub(crate) complete: IntCounterVec,
 }
-impl ReconcilerMetrics {
+pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
-    // Labels used on [`Self::complete`]
+    Lazy::new(StorageControllerMetrics::default);
    pub(crate) const SUCCESS: &'static str = "ok";
    pub(crate) const ERROR: &'static str = "success";
    pub(crate) const CANCEL: &'static str = "cancel";
 }
 pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
    spawned: register_int_counter!(
        "storage_controller_reconcile_spawn",
        "Count of how many times we spawn a reconcile task",
    )
    .expect("failed to define a metric"),
    complete: register_int_counter_vec!(
        "storage_controller_reconcile_complete",
        "Reconciler tasks completed, broken down by success/failure/cancelled",
        &["status"],
    )
    .expect("failed to define a metric"),
 });
 pub fn preinitialize_metrics() {
-    Lazy::force(&RECONCILER);
+    Lazy::force(&METRICS_REGISTRY);
 }
 /// Bundles the storage controller's metric group with the text encoder used to serialize it.
 pub(crate) struct StorageControllerMetrics {
    /// All metric definitions of the storage controller.
    pub(crate) metrics_group: StorageControllerMetricGroup,
    /// Text encoder, wrapped in a mutex so that it can be mutated through `&self`.
    encoder: Mutex<measured::text::TextEncoder>,
 }
 /// The full set of metrics exported by the storage controller.
 ///
 /// Each field is one metric; the field's doc comment describes what it measures.
 #[derive(measured::MetricGroup)]
 pub(crate) struct StorageControllerMetricGroup {
    /// Count of how many times we spawn a reconcile task
    pub(crate) storage_controller_reconcile_spawn: measured::Counter,
    /// Reconciler tasks completed, broken down by success/failure/cancelled
    pub(crate) storage_controller_reconcile_complete:
        measured::CounterVec<ReconcileCompleteLabelGroupSet>,
    /// HTTP request status counters for handled requests
    pub(crate) storage_controller_http_request_status:
        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
    /// HTTP request handler latency across all status codes
    pub(crate) storage_controller_http_request_latency:
        measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
    /// Count of HTTP requests to the pageserver that resulted in an error,
    /// broken down by the pageserver node id, request name and method
    pub(crate) storage_controller_pageserver_request_error:
        measured::CounterVec<PageserverRequestLabelGroupSet>,
    /// Latency of HTTP requests to the pageserver, broken down by pageserver
    /// node id, request name and method. This includes both successful and unsuccessful
    /// requests.
    pub(crate) storage_controller_pageserver_request_latency:
        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
    /// Count of pass-through HTTP requests to the pageserver that resulted in an error,
    /// broken down by the pageserver node id, request name and method
    pub(crate) storage_controller_passthrough_request_error:
        measured::CounterVec<PageserverRequestLabelGroupSet>,
    /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
    /// node id, request name and method. This includes both successful and unsuccessful
    /// requests.
    pub(crate) storage_controller_passthrough_request_latency:
        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
    /// Count of errors in database queries, broken down by error type and operation.
    pub(crate) storage_controller_database_query_error:
        measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
    /// Latency of database queries, broken down by operation.
    pub(crate) storage_controller_database_query_latency:
        measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
 }
 impl StorageControllerMetrics {
    /// Collects every metric of the group into the shared text encoder and returns the
    /// bytes produced by finishing the encoder.
    ///
    /// Panics if the encoder mutex is poisoned.
    pub(crate) fn encode(&self) -> Bytes {
        let Self {
            metrics_group,
            encoder,
        } = self;
        let mut locked = encoder.lock().unwrap();
        metrics_group.collect_into(&mut *locked);
        locked.finish()
    }
 }
 impl Default for StorageControllerMetrics {
    /// Creates a fresh metric group together with a new text encoder.
    fn default() -> Self {
        let metrics_group = StorageControllerMetricGroup::new();
        let encoder = Mutex::new(measured::text::TextEncoder::new());
        Self {
            metrics_group,
            encoder,
        }
    }
 }
 impl StorageControllerMetricGroup {
    pub(crate) fn new() -> Self {
        Self {
            storage_controller_reconcile_spawn: measured::Counter::new(),
            storage_controller_reconcile_complete: measured::CounterVec::new(
                ReconcileCompleteLabelGroupSet {
                    status: StaticLabelSet::new(),
                },
            ),
            storage_controller_http_request_status: measured::CounterVec::new(
                HttpRequestStatusLabelGroupSet {
                    path: lasso::ThreadedRodeo::new(),
                    method: StaticLabelSet::new(),
                    status: StaticLabelSet::new(),
                },
            ),
            storage_controller_http_request_latency: measured::HistogramVec::new(
                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
            ),
            storage_controller_pageserver_request_error: measured::CounterVec::new(
                PageserverRequestLabelGroupSet {
                    pageserver_id: lasso::ThreadedRodeo::new(),
                    path: lasso::ThreadedRodeo::new(),
                    method: StaticLabelSet::new(),
                },
            ),
            storage_controller_pageserver_request_latency: measured::HistogramVec::new(
                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
            ),
            storage_controller_passthrough_request_error: measured::CounterVec::new(
                PageserverRequestLabelGroupSet {
                    pageserver_id: lasso::ThreadedRodeo::new(),
                    path: lasso::ThreadedRodeo::new(),
                    method: StaticLabelSet::new(),
                },
            ),
            storage_controller_passthrough_request_latency: measured::HistogramVec::new(
                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
            ),
            storage_controller_database_query_error: measured::CounterVec::new(
                DatabaseQueryErrorLabelGroupSet {
                    operation: StaticLabelSet::new(),
                    error_type: StaticLabelSet::new(),
                },
            ),
            storage_controller_database_query_latency: measured::HistogramVec::new(
                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
            ),
        }
    }
 }
 /// Labels for the `storage_controller_reconcile_complete` counter.
 #[derive(measured::LabelGroup)]
 #[label(set = ReconcileCompleteLabelGroupSet)]
 pub(crate) struct ReconcileCompleteLabelGroup {
    /// How the reconcile task finished.
    pub(crate) status: ReconcileOutcome,
 }
 /// Labels for the `storage_controller_http_request_status` counter.
 #[derive(measured::LabelGroup)]
 #[label(set = HttpRequestStatusLabelGroupSet)]
 pub(crate) struct HttpRequestStatusLabelGroup<'a> {
    /// Request path label; a dynamic (string) label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
    /// HTTP status code of the response.
    pub(crate) status: StatusCode,
 }
 /// Labels for the `storage_controller_http_request_latency` histogram.
 #[derive(measured::LabelGroup)]
 #[label(set = HttpRequestLatencyLabelGroupSet)]
 pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
    /// Request path label; a dynamic (string) label backed by a `lasso::ThreadedRodeo`.
    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
 }
 impl Default for HttpRequestLatencyLabelGroupSet {
    /// Creates an empty label set: a new interner for `path` and a new static set for `method`.
    fn default() -> Self {
        let path = lasso::ThreadedRodeo::new();
        let method = StaticLabelSet::new();
        Self { path, method }
    }
 }
 /// Labels shared by the pageserver request and pass-through request metrics
 /// (both the error counters and the latency histograms).
 #[derive(measured::LabelGroup, Clone)]
 #[label(set = PageserverRequestLabelGroupSet)]
 pub(crate) struct PageserverRequestLabelGroup<'a> {
    /// Id of the pageserver node the request was sent to, as a dynamic string label.
    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) pageserver_id: &'a str,
    /// Request name, as a dynamic string label.
    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) path: &'a str,
    /// HTTP method of the request.
    pub(crate) method: Method,
 }
 impl Default for PageserverRequestLabelGroupSet {
    /// Creates an empty label set: new interners for `pageserver_id` and `path`, and a new
    /// static set for `method`.
    fn default() -> Self {
        let pageserver_id = lasso::ThreadedRodeo::new();
        let path = lasso::ThreadedRodeo::new();
        let method = StaticLabelSet::new();
        Self {
            pageserver_id,
            path,
            method,
        }
    }
 }
 /// Labels for the `storage_controller_database_query_error` counter.
 #[derive(measured::LabelGroup)]
 #[label(set = DatabaseQueryErrorLabelGroupSet)]
 pub(crate) struct DatabaseQueryErrorLabelGroup {
    /// Category of the database error.
    pub(crate) error_type: DatabaseErrorLabel,
    /// Database operation during which the error occurred.
    pub(crate) operation: DatabaseOperation,
 }
 /// Labels for the `storage_controller_database_query_latency` histogram.
 #[derive(measured::LabelGroup)]
 #[label(set = DatabaseQueryLatencyLabelGroupSet)]
 pub(crate) struct DatabaseQueryLatencyLabelGroup {
    /// Database operation whose latency is being recorded.
    pub(crate) operation: DatabaseOperation,
 }
 /// Outcome of a reconcile task, used as the `status` label of the reconcile-complete counter.
 #[derive(FixedCardinalityLabel)]
 pub(crate) enum ReconcileOutcome {
    /// The label value is renamed to "ok" by the attribute below.
    #[label(rename = "ok")]
    Success,
    Error,
    Cancel,
 }
 /// HTTP method used as a metric label.
 ///
 /// Only GET, PUT, POST and DELETE have dedicated variants; the `From<hyper::Method>`
 /// conversion maps every other method to `Other`.
 #[derive(FixedCardinalityLabel, Clone)]
 pub(crate) enum Method {
    Get,
    Put,
    Post,
    Delete,
    Other,
 }
 impl From<hyper::Method> for Method {
    /// Converts a `hyper::Method` into the metric label: GET, PUT, POST and DELETE map to
    /// their dedicated variants, anything else becomes `Method::Other`.
    fn from(value: hyper::Method) -> Self {
        match value {
            hyper::Method::GET => Method::Get,
            hyper::Method::PUT => Method::Put,
            hyper::Method::POST => Method::Post,
            hyper::Method::DELETE => Method::Delete,
            _ => Method::Other,
        }
    }
 }
 /// Newtype around `hyper::http::StatusCode` so that it can be used as a metric label.
 pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
 impl LabelValue for StatusCode {
    /// Emits the label value as an integer: the numeric status code widened to `u64`.
    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
        v.write_int(self.0.as_u16() as u64)
    }
 }
 impl FixedCardinalityLabel for StatusCode {
    /// Number of label values: the length of the range `100..1000`, i.e. 900.
    fn cardinality() -> usize {
        (100..1000).len()
    }
    /// Encodes the label as the raw numeric status code (no offset is subtracted).
    ///
    /// NOTE(review): the returned value is the code itself, which can be as large as 999,
    /// whereas `cardinality()` is 900. If the trait expects `encode() < cardinality()`,
    /// codes >= 900 would fall outside that range — confirm against the `measured` docs.
    fn encode(&self) -> usize {
        self.0.as_u16() as usize
    }
    /// Inverse of `encode`: interprets `value` as a numeric status code.
    ///
    /// Panics if `value` does not fit into a `u16` or is rejected by `StatusCode::from_u16`.
    fn decode(value: usize) -> Self {
        Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
    }
 }
 /// Category of a database error, used as the `error_type` metric label.
 ///
 /// There is one variant per `DatabaseError` variant matched in `DatabaseError::error_label`.
 #[derive(FixedCardinalityLabel)]
 pub(crate) enum DatabaseErrorLabel {
    Query,
    Connection,
    ConnectionPool,
    Logical,
 }
 impl DatabaseError {
    pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
        match self {
            Self::Query(_) => DatabaseErrorLabel::Query,
            Self::Connection(_) => DatabaseErrorLabel::Connection,
            Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
            Self::Logical(_) => DatabaseErrorLabel::Logical,
        }
    }
 }
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -12,7 +12,9 @@ use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};
-use crate::persistence::NodePersistence;
+use crate::{
    pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
 };
 /// Represents the in-memory description of a Node.
 ///
@@ -83,29 +85,38 @@ impl Node {
        }
    }
-    pub(crate) fn set_availability(
+    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
-        &mut self,
+        match self.get_availability_transition(availability) {
-        availability: NodeAvailability,
+            AvailabilityTransition::ToActive => {
    ) -> AvailabilityTransition {
        use NodeAvailability::*;
        let transition = match (self.availability, availability) {
            (Offline, Active) => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                // users of previously-cloned copies of the node will still see the old cancellation
                // state.  For example, Reconcilers in flight will have to complete and be spawned
                // again to realize that the node has become available.
                self.cancel = CancellationToken::new();
                AvailabilityTransition::ToActive
            }
-            (Active, Offline) => {
+            AvailabilityTransition::ToOffline => {
                // Fire the node's cancellation token to cancel any in-flight API requests to it
                self.cancel.cancel();
                AvailabilityTransition::ToOffline
            }
-            _ => AvailabilityTransition::Unchanged,
+            AvailabilityTransition::Unchanged => {}
-        };
+        }
        self.availability = availability;
-        transition
+    }
    /// Without modifying the availability of the node, convert the intended availability
    /// into a description of the transition.
    pub(crate) fn get_availability_transition(
        &self,
        availability: NodeAvailability,
    ) -> AvailabilityTransition {
        use AvailabilityTransition::*;
        use NodeAvailability::*;
        match (self.availability, availability) {
            (Offline, Active(_)) => ToActive,
            (Active(_), Offline) => ToOffline,
            _ => Unchanged,
        }
    }
    /// Whether we may send API requests to this node.
@@ -114,21 +125,21 @@ impl Node {
        // a reference to the original Node's cancellation status.  Checking both of these results
        // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
        // when we cloned it, or if the original Node instance's cancellation token was fired.
-        matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
+        matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
    }
    /// Is this node eligible to have work scheduled onto it?
-    pub(crate) fn may_schedule(&self) -> bool {
+    pub(crate) fn may_schedule(&self) -> MaySchedule {
-        match self.availability {
+        let score = match self.availability {
-            NodeAvailability::Active => {}
+            NodeAvailability::Active(score) => score,
-            NodeAvailability::Offline => return false,
+            NodeAvailability::Offline => return MaySchedule::No,
-        }
+        };
        match self.scheduling {
-            NodeSchedulingPolicy::Active => true,
+            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
-            NodeSchedulingPolicy::Draining => false,
+            NodeSchedulingPolicy::Draining => MaySchedule::No,
-            NodeSchedulingPolicy::Filling => true,
+            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
-            NodeSchedulingPolicy::Pause => false,
+            NodeSchedulingPolicy::Pause => MaySchedule::No,
        }
    }
@@ -146,8 +157,7 @@ impl Node {
            listen_pg_addr,
            listen_pg_port,
            scheduling: NodeSchedulingPolicy::Filling,
-            // TODO: we shouldn't really call this Active until we've heartbeated it.
+            availability: NodeAvailability::Offline,
            availability: NodeAvailability::Active,
            cancel: CancellationToken::new(),
        }
    }
@@ -194,7 +204,7 @@ impl Node {
        cancel: &CancellationToken,
    ) -> Option<mgmt_api::Result<T>>
    where
-        O: FnMut(mgmt_api::Client) -> F,
+        O: FnMut(PageserverClient) -> F,
        F: std::future::Future<Output = mgmt_api::Result<T>>,
    {
        fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -216,8 +226,12 @@ impl Node {
                    .build()
                    .expect("Failed to construct HTTP client");
-                let client =
+                let client = PageserverClient::from_client(
-                    mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
+                    self.get_id(),
                    http_client,
                    self.base_url(),
                    jwt.as_deref(),
                );
                let node_cancel_fut = self.cancel.cancelled();
--- a/control_plane/attachment_service/src/pageserver_client.rs
+++ b/control_plane/attachment_service/src/pageserver_client.rs
@@ -0,0 +1,203 @@
 use pageserver_api::{
    models::{
        LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
    },
    shard::TenantShardId,
 };
 use pageserver_client::mgmt_api::{Client, Result};
 use reqwest::StatusCode;
 use utils::id::{NodeId, TimelineId};
 /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
 /// controller to collect metrics in a non-intrusive manner.
 #[derive(Debug, Clone)]
 pub(crate) struct PageserverClient {
    /// The wrapped management API client that performs the actual requests.
    inner: Client,
    /// String form of the node id, used as the `pageserver_id` metric label.
    node_id_label: String,
 }
 /// Evaluates `$invoke` while recording pageserver request metrics, and yields its result.
 ///
 /// Arguments:
 /// - `$name`: string literal used as the `path` label (the request name),
 /// - `$method`: value used as the `method` label,
 /// - `$node_id`: value used as the `pageserver_id` label,
 /// - `$invoke`: the expression to run; its value must have an `is_err()` method.
 macro_rules! measured_request {
    ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
        let labels = crate::metrics::PageserverRequestLabelGroup {
            pageserver_id: $node_id,
            path: $name,
            method: $method,
        };
        let latency = &crate::metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_pageserver_request_latency;
        // The guard stays alive until the end of this block, i.e. across `$invoke`.
        // NOTE(review): presumably the latency is observed when the guard is dropped — confirm
        // against the `measured` crate.
        let _timer_guard = latency.start_timer(labels.clone());
        let res = $invoke;
        // Only failed requests are counted in the error counter.
        if res.is_err() {
            let error_counters = &crate::metrics::METRICS_REGISTRY
                .metrics_group
                .storage_controller_pageserver_request_error;
            error_counters.inc(labels)
        }
        res
    }};
 }
 /// Constructors plus one wrapper per management API call. Every wrapper forwards to the
 /// method of the same name on the inner client through `measured_request!`, tagging the
 /// metrics with a request name, an HTTP method label and this client's node id label.
 impl PageserverClient {
    /// Creates a client for `node_id` backed by a freshly created `reqwest::Client`.
    pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
        Self {
            inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
            node_id_label: node_id.0.to_string(),
        }
    }
    /// Creates a client for `node_id` that reuses the caller-supplied `raw_client`.
    pub(crate) fn from_client(
        node_id: NodeId,
        raw_client: reqwest::Client,
        mgmt_api_endpoint: String,
        jwt: Option<&str>,
    ) -> Self {
        Self {
            inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
            node_id_label: node_id.0.to_string(),
        }
    }
    /// Measured wrapper around `Client::tenant_delete` (request name "tenant", method Delete).
    pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
        measured_request!(
            "tenant",
            crate::metrics::Method::Delete,
            &self.node_id_label,
            self.inner.tenant_delete(tenant_shard_id).await
        )
    }
    /// Measured wrapper around `Client::tenant_time_travel_remote_storage`
    /// (request name "tenant_time_travel_remote_storage", method Put).
    pub(crate) async fn tenant_time_travel_remote_storage(
        &self,
        tenant_shard_id: TenantShardId,
        timestamp: &str,
        done_if_after: &str,
    ) -> Result<()> {
        measured_request!(
            "tenant_time_travel_remote_storage",
            crate::metrics::Method::Put,
            &self.node_id_label,
            self.inner
                .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
                .await
        )
    }
    /// Measured wrapper around `Client::tenant_secondary_download`
    /// (request name "tenant_secondary_download", method Post).
    pub(crate) async fn tenant_secondary_download(
        &self,
        tenant_id: TenantShardId,
        wait: Option<std::time::Duration>,
    ) -> Result<(StatusCode, SecondaryProgress)> {
        measured_request!(
            "tenant_secondary_download",
            crate::metrics::Method::Post,
            &self.node_id_label,
            self.inner.tenant_secondary_download(tenant_id, wait).await
        )
    }
    /// Measured wrapper around `Client::location_config`
    /// (request name "location_config", method Put).
    pub(crate) async fn location_config(
        &self,
        tenant_shard_id: TenantShardId,
        config: LocationConfig,
        flush_ms: Option<std::time::Duration>,
        lazy: bool,
    ) -> Result<()> {
        measured_request!(
            "location_config",
            crate::metrics::Method::Put,
            &self.node_id_label,
            self.inner
                .location_config(tenant_shard_id, config, flush_ms, lazy)
                .await
        )
    }
    /// Measured wrapper around `Client::list_location_config`
    /// (request name "location_configs", method Get).
    pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
        measured_request!(
            "location_configs",
            crate::metrics::Method::Get,
            &self.node_id_label,
            self.inner.list_location_config().await
        )
    }
    /// Measured wrapper around `Client::get_location_config`
    /// (request name "location_config", method Get).
    pub(crate) async fn get_location_config(
        &self,
        tenant_shard_id: TenantShardId,
    ) -> Result<Option<LocationConfig>> {
        measured_request!(
            "location_config",
            crate::metrics::Method::Get,
            &self.node_id_label,
            self.inner.get_location_config(tenant_shard_id).await
        )
    }
    /// Measured wrapper around `Client::timeline_create`
    /// (request name "timeline", method Post).
    pub(crate) async fn timeline_create(
        &self,
        tenant_shard_id: TenantShardId,
        req: &TimelineCreateRequest,
    ) -> Result<TimelineInfo> {
        measured_request!(
            "timeline",
            crate::metrics::Method::Post,
            &self.node_id_label,
            self.inner.timeline_create(tenant_shard_id, req).await
        )
    }
    /// Measured wrapper around `Client::timeline_delete`
    /// (request name "timeline", method Delete).
    pub(crate) async fn timeline_delete(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Result<StatusCode> {
        measured_request!(
            "timeline",
            crate::metrics::Method::Delete,
            &self.node_id_label,
            self.inner
                .timeline_delete(tenant_shard_id, timeline_id)
                .await
        )
    }
    /// Measured wrapper around `Client::tenant_shard_split`
    /// (request name "tenant_shard_split", method Put).
    pub(crate) async fn tenant_shard_split(
        &self,
        tenant_shard_id: TenantShardId,
        req: TenantShardSplitRequest,
    ) -> Result<TenantShardSplitResponse> {
        measured_request!(
            "tenant_shard_split",
            crate::metrics::Method::Put,
            &self.node_id_label,
            self.inner.tenant_shard_split(tenant_shard_id, req).await
        )
    }
    /// Measured wrapper around `Client::timeline_list`
    /// (request name "timelines", method Get).
    pub(crate) async fn timeline_list(
        &self,
        tenant_shard_id: &TenantShardId,
    ) -> Result<Vec<TimelineInfo>> {
        measured_request!(
            "timelines",
            crate::metrics::Method::Get,
            &self.node_id_label,
            self.inner.timeline_list(tenant_shard_id).await
        )
    }
    /// Measured wrapper around `Client::get_utilization`
    /// (request name "utilization", method Get).
    pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
        measured_request!(
            "utilization",
            crate::metrics::Method::Get,
            &self.node_id_label,
            self.inner.get_utilization().await
        )
    }
 }
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -19,11 +19,14 @@ use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};
 use crate::metrics::{
    DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
 };
 use crate::node::Node;
 /// ## What do we store?
 ///
-/// The attachment service does not store most of its state durably.
+/// The storage controller service does not store most of its state durably.
 ///
 /// The essential things to store durably are:
 /// - generation numbers, as these must always advance monotonically to ensure data safety.
@@ -37,7 +40,7 @@ use crate::node::Node;
 ///
 /// ## Performance/efficiency
 ///
-/// The attachment service does not go via the database for most things: there are
+/// The storage controller service does not go via the database for most things: there are
 /// a couple of places where we must, and where efficiency matters:
 /// - Incrementing generation numbers: the Reconciler has to wait for this to complete
 ///   before it can attach a tenant, so this acts as a bound on how fast things like
@@ -75,6 +78,25 @@ pub(crate) enum DatabaseError {
    Logical(String),
 }
 /// Names the database operation that a metric sample belongs to.
 ///
 /// A value of this enum is passed to `Persistence::with_measured_conn`, which uses it as the
 /// `operation` label of both the query-latency and the query-error metrics.
 #[derive(measured::FixedCardinalityLabel, Clone)]
 pub(crate) enum DatabaseOperation {
    InsertNode,
    UpdateNode,
    DeleteNode,
    ListNodes,
    BeginShardSplit,
    CompleteShardSplit,
    AbortShardSplit,
    Detach,
    ReAttach,
    IncrementGeneration,
    ListTenantShards,
    InsertTenantShards,
    UpdateTenantShard,
    DeleteTenant,
    UpdateTenantConfig,
 }
 #[must_use]
 pub(crate) enum AbortShardSplitStatus {
    /// We aborted the split in the database by reverting to the parent shards
@@ -115,6 +137,34 @@ impl Persistence {
        }
    }
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let latency = &METRICS_REGISTRY
            .metrics_group
            .storage_controller_database_query_latency;
        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
            operation: op.clone(),
        });
        let res = self.with_conn(func).await;
        if let Err(err) = &res {
            let error_counter = &METRICS_REGISTRY
                .metrics_group
                .storage_controller_database_query_error;
            error_counter.inc(DatabaseQueryErrorLabelGroup {
                error_type: err.error_label(),
                operation: op,
            })
        }
        res
    }
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
@@ -130,21 +180,27 @@ impl Persistence {
    /// When a node is first registered, persist it before using it for anything
    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
        let np = node.to_persistent();
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(
-            diesel::insert_into(crate::schema::nodes::table)
+            DatabaseOperation::InsertNode,
-                .values(&np)
+            move |conn| -> DatabaseResult<()> {
-                .execute(conn)?;
+                diesel::insert_into(crate::schema::nodes::table)
-            Ok(())
+                    .values(&np)
-        })
+                    .execute(conn)?;
                Ok(())
            },
        )
        .await
    }
    /// At startup, populate the list of nodes which our shards may be placed on
    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
        let nodes: Vec<NodePersistence> = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
+            .with_measured_conn(
-                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
+                DatabaseOperation::ListNodes,
-            })
+                move |conn| -> DatabaseResult<_> {
                    Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
                },
            )
            .await?;
        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -159,7 +215,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
        let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
                let updated = diesel::update(nodes)
                    .filter(node_id.eq(input_node_id.0 as i64))
                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -181,9 +237,12 @@ impl Persistence {
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
        let loaded = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
+            .with_measured_conn(
-                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+                DatabaseOperation::ListTenantShards,
-            })
+                move |conn| -> DatabaseResult<_> {
                    Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
                },
            )
            .await?;
        if loaded.is_empty() {
@@ -211,15 +270,10 @@ impl Persistence {
        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
-        for (tenant_id, tenant) in &mut decoded.tenants {
+        for shard in decoded.tenants.values_mut() {
-            // Backward compat: an old attachments.json from before PR #6251, replace
+            if shard.placement_policy == "\"Single\"" {
-            // empty strings with proper defaults.
+                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
-            if tenant.tenant_id.is_empty() {
+                shard.placement_policy = "{\"Attached\":0}".to_string();
                tenant.tenant_id = tenant_id.to_string();
                tenant.config = serde_json::to_string(&TenantConfig::default())
                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
            }
        }
@@ -265,17 +319,20 @@ impl Persistence {
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(
-            conn.transaction(|conn| -> QueryResult<()> {
+            DatabaseOperation::InsertTenantShards,
-                for tenant in &shards {
+            move |conn| -> DatabaseResult<()> {
-                    diesel::insert_into(tenant_shards)
+                conn.transaction(|conn| -> QueryResult<()> {
-                        .values(tenant)
+                    for tenant in &shards {
-                        .execute(conn)?;
+                        diesel::insert_into(tenant_shards)
-                }
+                            .values(tenant)
                            .execute(conn)?;
                    }
                    Ok(())
                })?;
                Ok(())
-            })?;
+            },
-            Ok(())
+        )
        })
        .await
    }
@@ -283,25 +340,31 @@ impl Persistence {
    /// the tenant from memory on this server.
    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(
-            diesel::delete(tenant_shards)
+            DatabaseOperation::DeleteTenant,
-                .filter(tenant_id.eq(del_tenant_id.to_string()))
+            move |conn| -> DatabaseResult<()> {
-                .execute(conn)?;
+                diesel::delete(tenant_shards)
                    .filter(tenant_id.eq(del_tenant_id.to_string()))
                    .execute(conn)?;
-            Ok(())
+                Ok(())
-        })
+            },
        )
        .await
    }
    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(
-            diesel::delete(nodes)
+            DatabaseOperation::DeleteNode,
-                .filter(node_id.eq(del_node_id.0 as i64))
+            move |conn| -> DatabaseResult<()> {
-                .execute(conn)?;
+                diesel::delete(nodes)
                    .filter(node_id.eq(del_node_id.0 as i64))
                    .execute(conn)?;
-            Ok(())
+                Ok(())
-        })
+            },
        )
        .await
    }
@@ -315,7 +378,7 @@ impl Persistence {
    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
                let rows_updated = diesel::update(tenant_shards)
                    .filter(generation_pageserver.eq(node_id.0 as i64))
                    .set(generation.eq(generation + 1))
@@ -365,7 +428,7 @@ impl Persistence {
    ) -> anyhow::Result<Generation> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_conn(move |conn| {
+            .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -409,7 +472,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
            let query = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -450,7 +513,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
            diesel::update(tenant_shards)
                .filter(tenant_id.eq(input_tenant_id.to_string()))
                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
@@ -465,7 +528,7 @@ impl Persistence {
    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| {
+        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
            let updated = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -495,7 +558,7 @@ impl Persistence {
        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
            conn.transaction(|conn| -> DatabaseResult<()> {
                // Mark parent shards as splitting
@@ -559,26 +622,29 @@ impl Persistence {
        old_shard_count: ShardCount,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
+        self.with_measured_conn(
-            conn.transaction(|conn| -> QueryResult<()> {
+            DatabaseOperation::CompleteShardSplit,
-                // Drop parent shards
+            move |conn| -> DatabaseResult<()> {
-                diesel::delete(tenant_shards)
+                conn.transaction(|conn| -> QueryResult<()> {
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    // Drop parent shards
-                    .filter(shard_count.eq(old_shard_count.literal() as i32))
+                    diesel::delete(tenant_shards)
-                    .execute(conn)?;
+                        .filter(tenant_id.eq(split_tenant_id.to_string()))
                        .filter(shard_count.eq(old_shard_count.literal() as i32))
                        .execute(conn)?;
-                // Clear sharding flag
+                    // Clear sharding flag
-                let updated = diesel::update(tenant_shards)
+                    let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .set((splitting.eq(0),))
+                        .set((splitting.eq(0),))
-                    .execute(conn)?;
+                        .execute(conn)?;
-                debug_assert!(updated > 0);
+                    debug_assert!(updated > 0);
                    Ok(())
                })?;
                Ok(())
-            })?;
+            },
-
+        )
            Ok(())
        })
        .await
    }
@@ -590,31 +656,44 @@ impl Persistence {
        new_shard_count: ShardCount,
    ) -> DatabaseResult<AbortShardSplitStatus> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
+        self.with_measured_conn(
-            let aborted = conn.transaction(|conn| -> QueryResult<AbortShardSplitStatus> {
+            DatabaseOperation::AbortShardSplit,
-                // Clear the splitting state on parent shards
+            move |conn| -> DatabaseResult<AbortShardSplitStatus> {
-                let updated = diesel::update(tenant_shards)
+                let aborted =
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
-                    .filter(shard_count.ne(new_shard_count.literal() as i32))
+                        // Clear the splitting state on parent shards
-                    .set((splitting.eq(0),))
+                        let updated = diesel::update(tenant_shards)
-                    .execute(conn)?;
+                            .filter(tenant_id.eq(split_tenant_id.to_string()))
                            .filter(shard_count.ne(new_shard_count.literal() as i32))
                            .set((splitting.eq(0),))
                            .execute(conn)?;
-                // Parent shards are already gone: we cannot abort.
+                        // Parent shards are already gone: we cannot abort.
-                if updated == 0 {
+                        if updated == 0 {
-                    return Ok(AbortShardSplitStatus::Complete);
+                            return Ok(AbortShardSplitStatus::Complete);
-                }
+                        }
-                // Erase child shards
+                        // Sanity check: if parent shards were present, their cardinality should
-                diesel::delete(tenant_shards)
+                        // be less than the number of child shards.
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                        if updated >= new_shard_count.count() as usize {
-                    .filter(shard_count.eq(new_shard_count.literal() as i32))
+                            return Err(DatabaseError::Logical(format!(
-                    .execute(conn)?;
+                                "Unexpected parent shard count {updated} while aborting split to \
                            count {new_shard_count:?} on tenant {split_tenant_id}"
                            )));
                        }
-                Ok(AbortShardSplitStatus::Aborted)
+                        // Erase child shards
-            })?;
+                        diesel::delete(tenant_shards)
                            .filter(tenant_id.eq(split_tenant_id.to_string()))
                            .filter(shard_count.eq(new_shard_count.literal() as i32))
                            .execute(conn)?;
-            Ok(aborted)
+                        Ok(AbortShardSplitStatus::Aborted)
-        })
+                    })?;
                Ok(aborted)
            },
        )
        .await
    }
 }
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,3 +1,4 @@
 use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
 use hyper::StatusCode;
@@ -8,7 +9,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_client::mgmt_api;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
@@ -117,6 +118,15 @@ impl Reconciler {
        flush_ms: Option<Duration>,
        lazy: bool,
    ) -> Result<(), ReconcileError> {
        if !node.is_available() && config.mode == LocationConfigMode::Detached {
            // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
            self.observed.locations.remove(&node.get_id());
            return Ok(());
        }
        self.observed
            .locations
            .insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -149,9 +159,16 @@ impl Reconciler {
        };
        tracing::info!("location_config({node}) complete: {:?}", config);
-        self.observed
+        match config.mode {
-            .locations
+            LocationConfigMode::Detached => {
-            .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
+                self.observed.locations.remove(&node.get_id());
            }
            _ => {
                self.observed
                    .locations
                    .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
            }
        }
        Ok(())
    }
@@ -243,8 +260,11 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let client =
+        let client = PageserverClient::new(
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
+            node.get_id(),
            node.base_url(),
            self.service_config.jwt_token.as_deref(),
        );
        let timelines = client.timeline_list(&tenant_shard_id).await?;
        Ok(timelines
@@ -258,22 +278,81 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> Result<(), ReconcileError> {
-        match node
+        // This is not the timeout for a request, but the total amount of time we're willing to wait
-            .with_client_retries(
+        // for a secondary location to get up to date before giving up and proceeding anyway.
-                |client| async move { client.tenant_secondary_download(tenant_shard_id).await },
+        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
-                &self.service_config.jwt_token,
+
-                1,
+        // This is the long-polling interval for the secondary download requests we send to the destination pageserver
-                1,
+        // during a migration.
-                Duration::from_secs(60),
+        const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
-                &self.cancel,
+
-            )
+        let started_at = Instant::now();
-            .await
+
-        {
+        loop {
-            None => Err(ReconcileError::Cancel),
+            let (status, progress) = match node
-            Some(Ok(_)) => Ok(()),
+                .with_client_retries(
-            Some(Err(e)) => {
+                    |client| async move {
-                tracing::info!("  (skipping destination download: {})", e);
+                        client
-                Ok(())
+                            .tenant_secondary_download(
                                tenant_shard_id,
                                Some(REQUEST_DOWNLOAD_TIMEOUT),
                            )
                            .await
                    },
                    &self.service_config.jwt_token,
                    1,
                    3,
                    REQUEST_DOWNLOAD_TIMEOUT * 2,
                    &self.cancel,
                )
                .await
            {
                None => Err(ReconcileError::Cancel),
                Some(Ok(v)) => Ok(v),
                Some(Err(e)) => {
                    // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
                    // attaching, but we should not let an issue with a secondary location stop us proceeding
                    // with a live migration.
                    tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
                    return Ok(());
                }
            }?;
            if status == StatusCode::OK {
                tracing::info!(
                    "Downloads to {} complete: {}/{} layers, {}/{} bytes",
                    node,
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                return Ok(());
            } else if status == StatusCode::ACCEPTED {
                let total_runtime = started_at.elapsed();
                if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
                    tracing::warn!("Timed out after {}ms downloading layers to {node}.  Progress so far: {}/{} layers, {}/{} bytes",
                        total_runtime.as_millis(),
                        progress.layers_downloaded,
                        progress.layers_total,
                        progress.bytes_downloaded,
                        progress.bytes_total
                    );
                    // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
                    // it just makes the I/O performance for users less good.
                    return Ok(());
                }
                // Log and proceed around the loop to retry.  We don't sleep between requests, because our HTTP call
                // to the pageserver is a long-poll.
                tracing::info!(
                    "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
                    node,
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
            }
        }
    }
@@ -416,7 +495,7 @@ impl Reconciler {
            }
        }
-        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
        // this location will be deleted in the general case reconciliation that runs after this.
        let origin_secondary_conf = build_location_config(
            &self.shard,
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,4 +1,5 @@
 use crate::{node::Node, tenant_state::TenantState};
 use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
@@ -19,15 +20,34 @@ impl From<ScheduleError> for ApiError {
 }
 /// Whether new shards may be scheduled onto a node.
 ///
 /// `Yes` carries a `UtilizationScore`; `No` carries nothing. The derived `PartialEq`
 /// compares the carried score as well as the variant.
 #[derive(Serialize, Eq, PartialEq)]
 pub enum MaySchedule {
    Yes(UtilizationScore),
    No,
 }
 #[derive(Serialize)]
 struct SchedulerNode {
    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
    shard_count: usize,
    /// Whether this node is currently eligible to have new shards scheduled (this is derived
    /// from a node's availability state and scheduling policy).
-    may_schedule: bool,
+    may_schedule: MaySchedule,
 }
 /// Equality for scheduler nodes: two nodes are equal when they have the same shard count and
 /// the same `MaySchedule` variant. The score carried by `MaySchedule::Yes` is ignored.
 impl PartialEq for SchedulerNode {
    fn eq(&self, other: &Self) -> bool {
        // Cheap integer comparison first; nodes with different shard counts are never equal.
        if self.shard_count != other.shard_count {
            return false;
        }

        // Compare only which variant each side holds, not the payload of `Yes`.
        match &self.may_schedule {
            MaySchedule::Yes(_) => matches!(other.may_schedule, MaySchedule::Yes(_)),
            MaySchedule::No => matches!(other.may_schedule, MaySchedule::No),
        }
    }
 }
 impl Eq for SchedulerNode {}
 /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
 /// on which to run.
 ///
@@ -186,13 +206,15 @@ impl Scheduler {
            return None;
        }
        // TODO: When the utilization score returned by the pageserver becomes meaningful,
        // schedule based on that instead of the shard count.
        let node = nodes
            .iter()
            .map(|node_id| {
                let may_schedule = self
                    .nodes
                    .get(node_id)
-                    .map(|n| n.may_schedule)
+                    .map(|n| n.may_schedule != MaySchedule::No)
                    .unwrap_or(false);
                (*node_id, may_schedule)
            })
@@ -211,7 +233,7 @@ impl Scheduler {
            .nodes
            .iter()
            .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) || !v.may_schedule {
+                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
                    None
                } else {
                    Some((*k, v.shard_count))
@@ -230,7 +252,7 @@ impl Scheduler {
            for (node_id, node) in &self.nodes {
                tracing::info!(
                    "Node {node_id}: may_schedule={} shards={}",
-                    node.may_schedule,
+                    node.may_schedule != MaySchedule::No,
                    node.shard_count
                );
            }
@@ -255,6 +277,7 @@ impl Scheduler {
 pub(crate) mod test_utils {
    use crate::node::Node;
    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
    use std::collections::HashMap;
    use utils::id::NodeId;
    /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -264,13 +287,14 @@ pub(crate) mod test_utils {
        (1..n + 1)
            .map(|i| {
                (NodeId(i), {
-                    let node = Node::new(
+                    let mut node = Node::new(
                        NodeId(i),
                        format!("httphost-{i}"),
                        80 + i as u16,
                        format!("pghost-{i}"),
                        5432 + i as u16,
                    );
                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                    assert!(node.is_available());
                    node
                })
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -4,7 +4,10 @@ use std::{
    time::Duration,
 };
-use crate::{metrics, persistence::TenantShardPersistence};
+use crate::{
    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
    persistence::TenantShardPersistence,
 };
 use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
@@ -457,22 +460,7 @@ impl TenantState {
        // Add/remove nodes to fulfil policy
        use PlacementPolicy::*;
        match self.policy {
-            Single => {
+            Attached(secondary_count) => {
                // Should have exactly one attached, and zero secondaries
                if !self.intent.secondary.is_empty() {
                    self.intent.clear_secondary(scheduler);
                    modified = true;
                }
                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
                modified |= modified_attached;
                if !self.intent.secondary.is_empty() {
                    self.intent.clear_secondary(scheduler);
                    modified = true;
                }
            }
            Double(secondary_count) => {
                let retain_secondaries = if self.intent.attached.is_none()
                    && scheduler.node_preferred(&self.intent.secondary).is_some()
                {
@@ -622,7 +610,7 @@ impl TenantState {
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn maybe_reconcile(
        &mut self,
-        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
+        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
        pageservers: &Arc<HashMap<NodeId, Node>>,
        compute_hook: &Arc<ComputeHook>,
        service_config: &service::Config,
@@ -733,7 +721,11 @@ impl TenantState {
        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
-        metrics::RECONCILER.spawned.inc();
+        metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_reconcile_spawn
            .inc();
        let result_tx = result_tx.clone();
        let join_handle = tokio::task::spawn(
            async move {
                // Wait for any previous reconcile task to complete before we start
@@ -750,10 +742,12 @@ impl TenantState {
                // TODO: wrap all remote API operations in cancellation check
                // as well.
                if reconciler.cancel.is_cancelled() {
-                    metrics::RECONCILER
+                    metrics::METRICS_REGISTRY
-                        .complete
+                        .metrics_group
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
+                        .storage_controller_reconcile_complete
-                        .inc();
+                        .inc(ReconcileCompleteLabelGroup {
                            status: ReconcileOutcome::Cancel,
                        });
                    return;
                }
@@ -768,18 +762,18 @@ impl TenantState {
                }
                // Update result counter
-                match &result {
+                let outcome_label = match &result {
-                    Ok(_) => metrics::RECONCILER
+                    Ok(_) => ReconcileOutcome::Success,
-                        .complete
+                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
-                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
+                    Err(_) => ReconcileOutcome::Error,
-                    Err(ReconcileError::Cancel) => metrics::RECONCILER
+                };
-                        .complete
+
-                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
+                metrics::METRICS_REGISTRY
-                    Err(_) => metrics::RECONCILER
+                    .metrics_group
-                        .complete
+                    .storage_controller_reconcile_complete
-                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
+                    .inc(ReconcileCompleteLabelGroup {
-                }
+                        status: outcome_label,
-                .inc();
+                    });
                result_tx
                    .send(ReconcileResult {
@@ -894,7 +888,7 @@ pub(crate) mod tests {
        let mut scheduler = Scheduler::new(nodes.values());
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
        tenant_state
            .schedule(&mut scheduler)
            .expect("we have enough nodes, scheduling should work");
@@ -942,7 +936,7 @@ pub(crate) mod tests {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());
-        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
        tenant_state.observed.locations.insert(
            NodeId(3),
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -294,7 +294,7 @@ where
    //      is in state 'taken' but the thread that would unlock it is
    //      not there.
    //   2. A rust object that represented some external resource in the
-    //      parent now got implicitly copied by the the fork, even though
+    //      parent now got implicitly copied by the fork, even though
    //      the object's type is not `Copy`. The parent program may use
    //      non-copyability as a way to enforce unique ownership of an
    //      external resource in the typesystem. The fork breaks that
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,11 +8,11 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
 use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{InitForceMode, LocalEnv};
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
 use pageserver_api::controller_api::{
    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
            "start" => rt.block_on(handle_start_all(sub_args, &env)),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
-            "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
            "mappings" => handle_mappings(sub_args, &mut env),
@@ -437,7 +437,7 @@ async fn handle_tenant(
            let placement_policy = match create_match.get_one::<String>("placement-policy") {
                Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
-                _ => PlacementPolicy::Single,
+                _ => PlacementPolicy::Attached(0),
            };
            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -445,14 +445,14 @@ async fn handle_tenant(
            // If tenant ID was not specified, generate one
            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
-            // We must register the tenant with the attachment service, so
+            // We must register the tenant with the storage controller, so
            // that when the pageserver restarts, it will be re-attached.
-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
-            attachment_service
+            storage_controller
                .tenant_create(TenantCreateRequest {
                    // Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the
-                    // attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
+                    // storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
-                    // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
+                    // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters {
@@ -476,9 +476,9 @@ async fn handle_tenant(
                .context("Failed to parse postgres version from the argument string")?;
            // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
-            // different shards picking different start lsns.  Maybe we have to teach attachment service
+            // different shards picking different start lsns.  Maybe we have to teach storage controller
            // to let shard 0 branch first and then propagate the chosen LSN to other shards.
-            attachment_service
+            storage_controller
                .tenant_timeline_create(
                    tenant_id,
                    TimelineCreateRequest {
@@ -523,84 +523,6 @@ async fn handle_tenant(
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
        Some(("migrate", matches)) => {
            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
            let new_pageserver = get_pageserver(env, matches)?;
            let new_pageserver_id = new_pageserver.conf.id;
            let attachment_service = AttachmentService::from_env(env);
            attachment_service
                .tenant_migrate(tenant_shard_id, new_pageserver_id)
                .await?;
            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
        }
        Some(("status", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;
            let mut shard_table = comfy_table::Table::new();
            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
            let mut tenant_synthetic_size = None;
            let attachment_service = AttachmentService::from_env(env);
            for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
                let pageserver =
                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
                let size = pageserver
                    .http_client
                    .tenant_details(shard.shard_id)
                    .await?
                    .tenant_info
                    .current_physical_size
                    .unwrap();
                shard_table.add_row([
                    format!("{}", shard.shard_id.shard_slug()),
                    format!("{}", shard.node_id.0),
                    format!("{} MiB", size / (1024 * 1024)),
                ]);
                if shard.shard_id.is_zero() {
                    tenant_synthetic_size =
                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
                }
            }
            let Some(synthetic_size) = tenant_synthetic_size else {
                bail!("Shard 0 not found")
            };
            let mut tenant_table = comfy_table::Table::new();
            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
            tenant_table.add_row([
                "Synthetic size".to_string(),
                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
            ]);
            println!("{tenant_table}");
            println!("{shard_table}");
        }
        Some(("shard-split", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;
            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
            let attachment_service = AttachmentService::from_env(env);
            let result = attachment_service
                .tenant_split(tenant_id, shard_count)
                .await?;
            println!(
                "Split tenant {} into shards {}",
                tenant_id,
                result
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
@@ -613,7 +535,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
            // where shard 0 is attached, and query there.
            let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
            let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
@@ -633,7 +555,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
            let new_timeline_id_opt = parse_timeline_id(create_match)?;
            let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
                ancestor_timeline_id: None,
@@ -641,7 +563,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                ancestor_start_lsn: None,
                pg_version: Some(pg_version),
            };
-            let timeline_info = attachment_service
+            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
                .await?;
@@ -730,7 +652,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
            let new_timeline_id = TimelineId::generate();
-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
                ancestor_timeline_id: Some(ancestor_timeline_id),
@@ -738,7 +660,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                ancestor_start_lsn: start_lsn,
                pg_version: None,
            };
-            let timeline_info = attachment_service
+            let timeline_info = storage_controller
                .tenant_timeline_create(tenant_id, create_req)
                .await?;
@@ -767,7 +689,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
    match sub_name {
        "list" => {
-            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
            // where shard 0 is attached, and query there.
            let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
            let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
@@ -952,21 +874,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                (
                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
                    // If caller is telling us what pageserver to use, this is not a tenant which is
-                    // full managed by attachment service, therefore not sharded.
+                    // fully managed by the storage controller, therefore not sharded.
                    ShardParameters::DEFAULT_STRIPE_SIZE,
                )
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
-                let attachment_service = AttachmentService::from_env(env);
+                let storage_controller = StorageController::from_env(env);
-                let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
+                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
                let pageservers = locate_result
                    .shards
                    .into_iter()
                    .map(|shard| {
                        (
                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Attachment service reported bad hostname"),
+                                .expect("Storage controller reported bad hostname"),
                            shard.listen_pg_port,
                        )
                    })
@@ -1015,8 +937,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                        pageserver.pg_connection_config.port(),
                    )]
                } else {
-                    let attachment_service = AttachmentService::from_env(env);
+                    let storage_controller = StorageController::from_env(env);
-                    attachment_service
+                    storage_controller
                        .tenant_locate(endpoint.tenant_id)
                        .await?
                        .shards
@@ -1024,7 +946,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                        .map(|shard| {
                            (
                                Host::parse(&shard.listen_pg_addr)
-                                    .expect("Attachment service reported malformed host"),
+                                    .expect("Storage controller reported malformed host"),
                                shard.listen_pg_port,
                            )
                        })
@@ -1100,9 +1022,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args), *register)
+                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1131,7 +1052,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }
            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args), false)
+                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1144,8 +1065,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            let scheduling = subcommand_args.get_one("scheduling");
            let availability = subcommand_args.get_one("availability");
-            let attachment_service = AttachmentService::from_env(env);
+            let storage_controller = StorageController::from_env(env);
-            attachment_service
+            storage_controller
                .node_configure(NodeConfigureRequest {
                    node_id: pageserver.conf.id,
                    scheduling: scheduling.cloned(),
@@ -1170,11 +1091,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }
-async fn handle_attachment_service(
+async fn handle_storage_controller(
    sub_match: &ArgMatches,
    env: &local_env::LocalEnv,
 ) -> Result<()> {
-    let svc = AttachmentService::from_env(env);
+    let svc = StorageController::from_env(env);
    match sub_match.subcommand() {
        Some(("start", _start_match)) => {
            if let Err(e) = svc.start().await {
@@ -1194,8 +1115,8 @@ async fn handle_attachment_service(
                exit(1);
            }
        }
-        Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
+        Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
-        None => bail!("no attachment_service subcommand provided"),
+        None => bail!("no storage_controller subcommand provided"),
    }
    Ok(())
 }
@@ -1280,11 +1201,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    broker::start_broker_process(env).await?;
-    // Only start the attachment service if the pageserver is configured to need it
+    // Only start the storage controller if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
+        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = attachment_service.start().await {
+        if let Err(e) = storage_controller.start().await {
-            eprintln!("attachment_service start failed: {:#}", e);
+            eprintln!("storage_controller start failed: {:#}", e);
            try_stop_all(env, true).await;
            exit(1);
        }
@@ -1293,7 +1214,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
        if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match), true)
+            .start(&pageserver_config_overrides(sub_match))
            .await
        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
@@ -1356,9 +1277,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    }
    if env.control_plane_api.is_some() {
-        let attachment_service = AttachmentService::from_env(env);
+        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate).await {
+        if let Err(e) = storage_controller.stop(immediate).await {
-            eprintln!("attachment service stop failed: {e:#}");
+            eprintln!("storage controller stop failed: {e:#}");
        }
    }
 }
@@ -1575,18 +1496,6 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
            .subcommand(Command::new("migrate")
                .about("Migrate a tenant from one pageserver to another")
                .arg(tenant_id_arg.clone())
                .arg(pageserver_id_arg.clone()))
            .subcommand(Command::new("status")
                .about("Human readable summary of the tenant's shards and attachment locations")
                .arg(tenant_id_arg.clone()))
            .subcommand(Command::new("shard-split")
                .about("Increase the number of shards in the tenant")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
                )
        )
        .subcommand(
            Command::new("pageserver")
@@ -1596,11 +1505,7 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
+                    .arg(pageserver_config_args.clone())
                    .long("register")
                    .default_value("true").required(false)
                    .value_parser(value_parser!(bool))
                    .value_name("register"))
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1618,9 +1523,9 @@ fn cli() -> Command {
                )
        )
        .subcommand(
-            Command::new("attachment_service")
+            Command::new("storage_controller")
                .arg_required_else_help(true)
-                .about("Manage attachment_service")
+                .about("Manage storage_controller")
                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
                .subcommand(Command::new("stop").about("Stop local pageserver")
                            .arg(stop_mode_arg.clone()))
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -12,7 +12,7 @@
 //!
 //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
 //! started, we launch `compute_ctl`. It synchronizes the safekeepers, downloads
-//! the basebackup from the pageserver to initialize the the data directory, and
+//! the basebackup from the pageserver to initialize the data directory, and
 //! finally launches the PostgreSQL process. It watches the PostgreSQL process
 //! until it exits.
 //!
@@ -57,9 +57,9 @@ use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
 use crate::attachment_service::AttachmentService;
 use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
 use crate::storage_controller::StorageController;
 use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
@@ -750,17 +750,17 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);
-        // If we weren't given explicit pageservers, query the attachment service
+        // If we weren't given explicit pageservers, query the storage controller
        if pageservers.is_empty() {
-            let attachment_service = AttachmentService::from_env(&self.env);
+            let storage_controller = StorageController::from_env(&self.env);
-            let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
+            let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
            pageservers = locate_result
                .shards
                .into_iter()
                .map(|shard| {
                    (
                        Host::parse(&shard.listen_pg_addr)
-                            .expect("Attachment service reported bad hostname"),
+                            .expect("Storage controller reported bad hostname"),
                        shard.listen_pg_port,
                    )
                })
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -6,7 +6,6 @@
 //! local installations.
 #![deny(clippy::undocumented_unsafe_blocks)]
 pub mod attachment_service;
 mod background_process;
 pub mod broker;
 pub mod endpoint;
@@ -14,3 +13,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
 pub mod storage_controller;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -72,13 +72,13 @@ pub struct LocalEnv {
    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,
-    // Control plane upcall API for pageserver: if None, we will not run attachment_service.  If set, this will
+    // Control plane upcall API for pageserver: if None, we will not run storage_controller.  If set, this will
    // be propagated into each pageserver's configuration.
    #[serde(default)]
    pub control_plane_api: Option<Url>,
-    // Control plane upcall API for attachment service.  If set, this will be propagated into the
+    // Control plane upcall API for storage controller.  If set, this will be propagated into the
-    // attachment service's configuration.
+    // storage controller's configuration.
    #[serde(default)]
    pub control_plane_compute_hook_api: Option<Url>,
@@ -114,7 +114,7 @@ impl NeonBroker {
 }
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default)]
+#[serde(default, deny_unknown_fields)]
 pub struct PageServerConf {
    // node id
    pub id: NodeId,
@@ -126,6 +126,9 @@ pub struct PageServerConf {
    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
    pub(crate) virtual_file_io_engine: Option<String>,
    pub(crate) get_vectored_impl: Option<String>,
 }
 impl Default for PageServerConf {
@@ -136,6 +139,8 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
            virtual_file_io_engine: None,
            get_vectored_impl: None,
        }
    }
 }
@@ -227,10 +232,10 @@ impl LocalEnv {
        self.neon_distrib_dir.join("pageserver")
    }
-    pub fn attachment_service_bin(&self) -> PathBuf {
+    pub fn storage_controller_bin(&self) -> PathBuf {
-        // Irrespective of configuration, attachment service binary is always
+        // Irrespective of configuration, storage controller binary is always
        // run from the same location as neon_local.  This means that for compatibility
-        // tests that run old pageserver/safekeeper, they still run latest attachment service.
+        // tests that run old pageserver/safekeeper, they still run latest storage controller.
        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
        neon_local_bin_dir.join("storage_controller")
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,8 +17,6 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
 use hyper::StatusCode;
 use pageserver_api::controller_api::NodeRegisterRequest;
 use pageserver_api::models::{
    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
 };
@@ -32,7 +30,6 @@ use utils::{
    lsn::Lsn,
 };
 use crate::attachment_service::AttachmentService;
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};
@@ -81,18 +78,39 @@ impl PageServerNode {
    ///
    /// These all end up on the command line of the `pageserver` binary.
    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );
-        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
+        let PageServerConf {
-        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
+            id,
            listen_pg_addr,
            listen_http_addr,
            pg_auth_type,
            http_auth_type,
            virtual_file_io_engine,
            get_vectored_impl,
        } = &self.conf;
-        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
+        let id = format!("id={}", id);
-        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
+
        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
        } else {
            String::new()
        };
        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
            format!("get_vectored_impl='{get_vectored_impl}'")
        } else {
            String::new()
        };
        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -104,6 +122,8 @@ impl PageServerNode {
            listen_http_addr_param,
            listen_pg_addr_param,
            broker_endpoint_param,
            virtual_file_io_engine,
            get_vectored_impl,
        ];
        if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -112,9 +132,9 @@ impl PageServerNode {
                control_plane_api.as_str()
            ));
-            // Attachment service uses the same auth as pageserver: if JWT is enabled
+            // Storage controller uses the same auth as pageserver: if JWT is enabled
            // for us, we will also need it to talk to them.
-            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
+            if matches!(http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
                    .env
                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -132,8 +152,7 @@ impl PageServerNode {
            ));
        }
-        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
+        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
@@ -164,8 +183,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }
-    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false, register).await
+        self.start_node(config_overrides, false).await
    }
    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -203,6 +222,28 @@ impl PageServerNode {
            String::from_utf8_lossy(&init_output.stderr),
        );
        // Write metadata file, used by pageserver on startup to register itself with
        // the storage controller
        let metadata_path = datadir.join("metadata.json");
        let (_http_host, http_port) =
            parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
        let http_port = http_port.unwrap_or(9898);
        // Intentionally hand-craft JSON: this acts as an implicit format compat test
        // in case the pageserver-side structure is edited, and reflects the real life
        // situation: the metadata is written by some other script.
        std::fs::write(
            metadata_path,
            serde_json::to_vec(&serde_json::json!({
                "host": "localhost",
                "port": self.pg_connection_config.port(),
                "http_host": "localhost",
                "http_port": http_port,
            }))
            .unwrap(),
        )
        .expect("Failed to write metadata file");
        Ok(())
    }
@@ -210,27 +251,7 @@ impl PageServerNode {
        &self,
        config_overrides: &[&str],
        update_config: bool,
        register: bool,
    ) -> anyhow::Result<()> {
        // Register the node with the storage controller before starting pageserver: pageserver must be registered to
        // successfully call /re-attach and finish starting up.
        if register {
            let attachment_service = AttachmentService::from_env(&self.env);
            let (pg_host, pg_port) =
                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
                .expect("Unable to parse listen_http_addr");
            attachment_service
                .node_register(NodeRegisterRequest {
                    node_id: self.conf.id,
                    listen_pg_addr: pg_host.to_string(),
                    listen_pg_port: pg_port.unwrap_or(5432),
                    listen_http_addr: http_host.to_string(),
                    listen_http_port: http_port.unwrap_or(80),
                })
                .await?;
        }
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
@@ -263,11 +284,6 @@ impl PageServerNode {
                match st {
                    Ok(()) => Ok(true),
                    Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
                    Err(mgmt_api::Error::ApiError(status, _msg))
                        if status == StatusCode::SERVICE_UNAVAILABLE =>
                    {
                        Ok(false)
                    }
                    Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
                }
            },
@@ -560,13 +576,6 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }
    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
        Ok(self
            .http_client
            .tenant_secondary_download(*tenant_id)
            .await?)
    }
    pub async fn timeline_create(
        &self,
        tenant_shard_id: TenantShardId,
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -10,7 +10,7 @@ use pageserver_api::{
        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
        TimelineCreateRequest, TimelineInfo,
    },
-    shard::TenantShardId,
+    shard::{ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -24,7 +24,7 @@ use utils::{
    id::{NodeId, TenantId},
 };
-pub struct AttachmentService {
+pub struct StorageController {
    env: LocalEnv,
    listen: String,
    path: Utf8PathBuf,
@@ -36,7 +36,10 @@ pub struct AttachmentService {
 const COMMAND: &str = "storage_controller";
-const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
+const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 // Use a shorter pageserver unavailability interval than the default to speed up tests.
 const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
@@ -59,7 +62,7 @@ pub struct InspectResponse {
    pub attachment: Option<(u32, NodeId)>,
 }
-impl AttachmentService {
+impl StorageController {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
            .unwrap()
@@ -136,27 +139,27 @@ impl AttachmentService {
    }
    fn pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
            .expect("non-Unicode path")
    }
-    /// PIDFile for the postgres instance used to store attachment service state
+    /// PIDFile for the postgres instance used to store storage controller state
    fn postgres_pid_file(&self) -> Utf8PathBuf {
        Utf8PathBuf::from_path_buf(
            self.env
                .base_data_dir
-                .join("attachment_service_postgres.pid"),
+                .join("storage_controller_postgres.pid"),
        )
        .expect("non-Unicode path")
    }
    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
    ///
-    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
+    /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
    /// to other versions if that one isn't found.  Some automated tests create circumstances
    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
+        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
        for v in prefer_versions {
            let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
@@ -189,7 +192,7 @@ impl AttachmentService {
    ///
    /// Returns the database url
    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        const DB_NAME: &str = "attachment_service";
+        const DB_NAME: &str = "storage_controller";
        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
        let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -219,10 +222,10 @@ impl AttachmentService {
    }
    pub async fn start(&self) -> anyhow::Result<()> {
-        // Start a vanilla Postgres process used by the attachment service for persistence.
+        // Start a vanilla Postgres process used by the storage controller for persistence.
        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
            .unwrap()
-            .join("attachment_service_db");
+            .join("storage_controller_db");
        let pg_bin_dir = self.get_pg_bin_dir().await?;
        let pg_log_path = pg_data_path.join("postgres.log");
@@ -245,7 +248,7 @@ impl AttachmentService {
            .await?;
        };
-        println!("Starting attachment service database...");
+        println!("Starting storage controller database...");
        let db_start_args = [
            "-w",
            "-D",
@@ -256,7 +259,7 @@ impl AttachmentService {
        ];
        background_process::start_process(
-            "attachment_service_db",
+            "storage_controller_db",
            &self.env.base_data_dir,
            pg_bin_dir.join("pg_ctl").as_std_path(),
            db_start_args,
@@ -269,13 +272,18 @@ impl AttachmentService {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;
        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
        let mut args = vec![
            "-l",
            &self.listen,
            "-p",
            self.path.as_ref(),
            "--dev",
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -300,7 +308,7 @@ impl AttachmentService {
        background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
-            &self.env.attachment_service_bin(),
+            &self.env.storage_controller_bin(),
            args,
            [(
                "NEON_REPO_DIR".to_string(),
@@ -322,10 +330,10 @@ impl AttachmentService {
    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
-        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
+        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
        let pg_bin_dir = self.get_pg_bin_dir().await?;
-        println!("Stopping attachment service database...");
+        println!("Stopping storage controller database...");
        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
            .args(pg_stop_args)
@@ -344,10 +352,10 @@ impl AttachmentService {
            // fine that stop failed.  Otherwise it is an error that stop failed.
            const PG_STATUS_NOT_RUNNING: i32 = 3;
            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
-                println!("Attachment service data base is already stopped");
+                println!("Storage controller database is already stopped");
                return Ok(());
            } else {
-                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
+                anyhow::bail!("Failed to stop storage controller database: {stop_status}")
            }
        }
@@ -368,7 +376,7 @@ impl AttachmentService {
        }
    }
-    /// Simple HTTP request wrapper for calling into attachment service
+    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
@@ -468,7 +476,7 @@ impl AttachmentService {
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
            Method::GET,
-            format!("control/v1/tenant/{tenant_id}/locate"),
+            format!("debug/v1/tenant/{tenant_id}/locate"),
            None,
        )
        .await
@@ -496,11 +504,15 @@ impl AttachmentService {
        &self,
        tenant_id: TenantId,
        new_shard_count: u8,
        new_stripe_size: Option<ShardStripeSize>,
    ) -> anyhow::Result<TenantShardSplitResponse> {
        self.dispatch(
            Method::PUT,
            format!("control/v1/tenant/{tenant_id}/shard_split"),
-            Some(TenantShardSplitRequest { new_shard_count }),
+            Some(TenantShardSplitRequest {
                new_shard_count,
                new_stripe_size,
            }),
        )
        .await
    }
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
 Should only be used e.g. for status check.
 Currently also used for connection from any pageserver to any safekeeper.
-"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
+"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane.
-"admin": Provides access to the control plane and admin APIs of the attachment service.
+"admin": Provides access to the control plane and admin APIs of the storage controller.
 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:
--- a/docs/rfcs/031-sharding-static.md
+++ b/docs/rfcs/031-sharding-static.md
@@ -0,0 +1,408 @@
 # Sharding Phase 1: Static Key-space Sharding
 ## Summary
 To enable databases with sizes approaching the capacity of a pageserver's disk,
 it is necessary to break up the storage for the database, or _shard_ it.
 Sharding in general is a complex area. This RFC aims to define an initial
 capability that will permit creating large-capacity databases using a static configuration
 defined at time of Tenant creation.
 ## Motivation
 Currently, all data for a Tenant, including all its timelines, is stored on a single
 pageserver. The local storage required may be several times larger than the actual
 database size, due to LSM write inflation.
 If a database is larger than what one pageserver can hold, then it becomes impossible
 for the pageserver to hold it in local storage, as it must do to provide service to
 clients.
 ### Prior art
 In Neon:
 - Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
 - Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
 - Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
 Prior art in other distributed systems is too broad to capture here: pretty much
 any scale out storage system does something like this.
 ## Requirements
 - Enable creating a large (for example, 16TiB) database without requiring dedicated
  pageserver nodes.
 - Share read/write bandwidth costs for large databases across pageservers, as well
  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
  that disrupt service to other tenants.
 - Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
  does not write out a single contiguous range of page numbers.
 _Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
 that a user might create on a current-gen enterprise SSD should also work well on
 Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
 pageserver backend is not the limiting factor in the database size_.
 ## Non Goals
 - Independently distributing timelines within the same tenant. If a tenant has many
  timelines, then sharding may be a less efficient mechanism for distributing load than
  sharing out timelines between pageservers.
 - Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
  based on the idea that separate mechanisms will make sense for each dimension.
 ## Impacted Components
 pageserver, control plane, postgres/smgr
 ## Terminology
 **Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
 the page number is the key in that store. `Key` is a literal data type in existing code.
 **LSN dimension**: this just means the range of LSNs (history), when talking about the range
 of keys and LSNs as a two dimensional space.
 ## Implementation
 ### Key sharding vs. LSN sharding
 When we think of sharding across the two dimensional key/lsn space, this is an
 opportunity to think about how the two dimensions differ:
 - Sharding the key space distributes the _write_ workload of ingesting data
  and compacting. This work must be carefully managed so that exactly one
  node owns a given key.
 - Sharding the LSN space distributes the _historical read_ workload. This work
  can be done by anyone without any special coordination, as long as they can
  see the remote index and layers.
 The key sharding is the harder part, and also the more urgent one, to support larger
 capacity databases. Because distributing historical LSN read work is a relatively
 simpler problem that most users don't have, we defer it to future work. It is anticipated
 that some quite simple P2P offload model will enable distributing work for historical
 reads: a node which is low on space can call out to a peer to ask it to download and
 serve reads from a historical layer.
 ### Key mapping scheme
 Having decided to focus on key sharding, we must next decide how we will map
 keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
 between data locality and avoiding entire large relations mapping to the same shard.
 We will define two spaces:
 - Key space: unsigned integer
 - Shard space: integer from 0 to N-1, where we have N shards.
 ### Key -> Shard mapping
 Keys are currently defined in the pageserver's getpage@lsn interface as follows:
 ```
 pub struct Key {
    pub field1: u8,
    pub field2: u32,
    pub field3: u32,
    pub field4: u32,
    pub field5: u8,
    pub field6: u32,
 }
 fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum,
        field6: blknum,
    }
 }
 ```
 _Note: keys for relation metadata are ignored here, as this data will be mirrored to all
 shards. For distribution purposes, we only care about user data keys_
 The properties we want from our Key->Shard mapping are:
 - Locality in `blknum`, such that adjacent `blknum` will usually map to
  the same stripe and consequently land on the same shard, even though the overall
  collection of blocks in a relation will be spread over many stripes and therefore
  many shards.
 - Avoid the same blknum on different relations landing on the same stripe, so that
  with many small relations we do not end up aliasing data to the same stripe/shard.
 - Avoid vulnerability to aliasing in the values of relation identity fields, such that
  if there are patterns in the value of `relnode`, these do not manifest as patterns
  in data placement.
 To accomplish this, the blknum is used to select a stripe, and stripes are
 assigned to shards in a pseudorandom order via a hash. The motivation for
 pseudo-random distribution (rather than sequential mapping of stripe to shard)
 is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
 all relations' stripes to touch pageservers in the same order.
 To map a `Key` to a shard:
 - Hash the `Key` field 4 (relNode).
 - Divide field 6 (`blknum`) by the stripe size in pages, and combine the
  hash of this with the hash from the previous step.
 - The total hash modulo the shard count gives the shard holding this key.
 Why don't we use the other fields in the Key?
 - We ignore `forknum` for key mapping, because it distinguishes different classes of data
  in the same relation, and we would like to keep the data in a relation together.
 - We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
  database's blocks differ only by spcNode and dbNode from the original. To enable running
  this type of creation without cross-pageserver communication, we must ensure that these
  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
 ### Data placement examples
 For example, consider these extreme cases of postgres data layout for a large database in a system with 8 shards
 and a stripe size of 32k pages:
 - A single large relation: `blknum` division will break the data up into 4096
  stripes, which will be scattered across the shards.
 - 4096 relations of 32k pages each: each relation will map to exactly one stripe,
  and that stripe will be placed according to the hash of key field 4. The
  data placement will be statistically uniform across shards.
 Data placement will be more uneven on smaller databases:
 - A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
  that both relations land on the same shard and no data lands on the other shard.
 - A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
  the data of the other four shards.
 These uneven cases for small amounts of data do not matter, as long as the stripe size
 is an order of magnitude smaller than the amount of data we are comfortable holding
 in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
 a tenant has some shards with 256MB size and some shards with 512MB size, even though
 the standard deviation of shard size within the tenant is very high. Our key mapping
 scheme provides a statistical guarantee that as the tenant's overall data size increases,
 uniformity of placement will improve.
 ### Important Types
 #### `ShardIdentity`
 Provides the information needed to know whether a particular key belongs
 to a particular shard:
 - Layout version
 - Stripe size
 - Shard count
 - Shard index
 This structure's size is constant. Note that if we had used a different key
 mapping scheme such as consistent hashing with explicit hash ranges assigned
 to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
 key mapping scheme used here enables a small fixed size ShardIdentity.
 ### Pageserver changes
 #### Structural
 Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
 `TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
 of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
 covers the whole keyspace.
 When the pageserver writes layers and index_part.json to remote storage, it must
 include the shard index & count in the name, to avoid collisions (the count is
 necessary for future-proofing: the count will vary in time). These keys
 will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
 exactly the same for TenantShards as it does for Tenants today: each shard will have
 its own generation number.
 #### Storage Format: Keys
 For tenants with >1 shard, layer files implicitly become sparse: within the key
 range described in the layer name, the layer file for a shard will only hold the
 content relevant to stripes assigned to the shard.
 For this reason, the LayerFileName within a tenant is no longer unique: different shards
 may use the same LayerFileName to refer to different data. We may solve this simply
 by including the shard number in the keys used for layers.
 The shard number will be included as a prefix (as part of tenant ID), like this:
 `pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
 `pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
 Reasons for this particular format:
 - Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
  we construct a layer file name), and enables efficient listing of index_parts within
  a particular shard-timeline prefix.
 - Including the shard _count_ as well as shard number means that in future when we implement
  shard splitting, it will be possible for a parent shard and one of its children to write
  the same layer file without a name collision. For example, a parent shard 0_1 might split
  into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
  that is distinct from what shard 0_1 would have written at the same place.
 In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
 and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
 for example a single-shard tenant's prefix will be `0001`.
 For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
 and use this as a cue to construct paths with no prefix at all.
 #### Storage Format: Indices
 In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
 when we implement shard splitting in future, it will be useful to enable shards to reference layers
 written by other shards (specifically the parent shard during a split), so that shards don't
 have to exhaustively copy all data into their own shard-prefixed keys.
 To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
 tuple on each layer, such that it can construct paths for layers written by other shards. This
 naturally raises the question of who "owns" such layers written by ancestral shards: this problem
 will be addressed in phase 2.
 For backward compatibility, any index entry without shard information will be assumed to be
 in the legacy `ShardIdentity`.
 #### WAL Ingest
 In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
 it down to the pages relevant to their shard:
 - For ordinary user data writes, only retain a write if it matches the ShardIdentity
 - For metadata describing relations etc, all shards retain these writes.
 The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
 one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
 and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
 expensive: if the safekeeper can be made shard-aware then it could be taught to use
 the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
 #### Compaction/GC
 No changes needed.
 The pageserver doesn't have to do anything special during compaction
 or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
 This will result in sparse layer files, containing keys only in the stripes that this
 shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
 the key range, these should be updated to ignore gaps that are due to sharding, to
 avoid spuriously splitting up layers into stripe-sized pieces.
 ### Compute Endpoints
 Compute endpoints will need to:
 - Accept a vector of connection strings as part of their configuration from the control plane
 - Route pageserver requests according to mapping the hash of key to the correct
  entry in the vector of connection strings.
 Doing this in compute rather than routing requests via a single pageserver is
 necessary to enable sharding tenants without adding latency from extra hops.
 ### Control Plane
 Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
 be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
 tenants.
 Tenant lifecycle operations like deletion will require fanning-out to all the shards
 in the tenant. The same goes for timeline creation and deletion: a timeline should
 not be considered created until it has been created in all shards.
 #### Selectively enabling sharding for large tenants
 Initially, we will explicitly enable sharding for large tenants only.
 In future, this hint mechanism will become optional when we implement automatic
 re-sharding of tenants.
 ## Future Phases
 This section exists to indicate what will likely come next after this phase.
 Phases 2a and 2b are amenable to execution in parallel.
 ### Phase 2a: WAL fan-out
 **Problem**: when all shards consume the whole WAL, the network bandwidth used
 for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
 of the shard count.
 Network bandwidth is not our most pressing bottleneck, but it is likely to become
 a problem if we set a modest shard count (~8) on a significant number of tenants,
 especially as those larger tenants which we shard are also likely to have higher
 write bandwidth than average.
 ### Phase 2b: Shard Splitting
 **Problem**: the number of shards in a tenant is defined at creation time and cannot
 be changed. This causes excessive sharding for most small tenants, and an upper
 bound on scale for very large tenants.
 To address this, a _splitting_ feature will later be added. One shard can split its
 data into a number of children by doing a special compaction operation to generate
 image layers broken up child-shard-wise, and then writing out an `index_part.json` for
 each child. This will then require external coordination (by the control plane) to
 safely attach these new child shards and then move them around to distribute work.
 The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
 once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
 the risk/complexity of implementing such a rarely-encountered scenario.
 ### Phase N (future): distributed historical reads
 **Problem**: while sharding based on key is good for handling changes in overall
 database size, it is less suitable for spiky/unpredictable changes in the read
 workload to historical layers. Sudden increases in historical reads could result
 in sudden increases in local disk capacity required for a TenantShard.
 Example: the extreme case of this would be to run a tenant for a year, then create branches
 with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
 the on-disk capacity footprint of a TenantShard, since it would be serving reads
 from all those disparate historical layers.
 If we can respond fast enough, then key-sharding a tenant more finely can help with
 this, but splitting may be a relatively expensive operation and the increased historical
 read load may be transient.
 A separate mechanism for handling heavy historical reads could be something like
 a gossip mechanism for pageservers to communicate
 about their workload, and then a getpageatlsn offload mechanism where one pageserver can
 ask another to go read the necessary layers from remote storage to serve the read. This
 requires relatively little coordination because it is read-only: any node can service any
 read. All reads to a particular shard would still flow through one node, but the
 disk capacity & I/O impact of servicing the read would be distributed.
 ## FAQ/Alternatives
 ### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
 When a database is growing under a write workload, writes may predominantly hit the
 end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
 is intensively re-writing a particular relation, if that relation lived in a particular
 shard then it would not achieve our goal of distributing the write work across shards.
 ### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
 1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
   database would still cause a load hotspot on the pageserver routing its read requests.
 2. The additional hop through the "proxy" pageserver would add latency and overall
   resource cost (CPU, network bandwidth)
 ### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
 In this model, there would be no explicit sharding of work, but the pageserver to which
 a tenant is attached would not hold all layers on its disk: instead, it would call out
 to peers to have them store some layers, and call out to those peers to request reads
 in those layers.
 This mechanism will work well for distributing work in the LSN dimension, but in the key
 space dimension it has the major limitation of requiring one node to handle all
 incoming writes, and compactions. Even if the write workload for a large database
 fits in one pageserver, it will still be a hotspot and such tenants may still
 de-facto require their own pageserver.
--- a/docs/rfcs/032-shard-splitting.md
+++ b/docs/rfcs/032-shard-splitting.md
@@ -0,0 +1,479 @@
 # Shard splitting
 ## Summary
 This RFC describes a new pageserver API for splitting an existing tenant shard into
 multiple shards, and describes how to use this API to safely increase the total
 shard count of a tenant.
 ## Motivation
 In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
 tenants beyond the capacity of a single pageserver by breaking up the key space
 into stripes, and distributing these stripes across many pageservers. However,
 the shard count was defined once at tenant creation time and not varied thereafter.
 In practice, the expected size of a database is rarely known at creation time, and
 it is inefficient to enable sharding for very small tenants: we need to be
 able to create a tenant with a small number of shards (such as 1), and later expand
 when it becomes clear that the tenant has grown in size to a point where sharding
 is beneficial.
 ### Prior art
 Many distributed systems have the problem of choosing how many shards to create for
 tenants that do not specify an expected size up-front. There are a couple of general
 approaches:
 - Write to a key space in order, and start a new shard when the highest key advances
  past some point. This doesn't work well for Neon, because we write to our key space
  in many different contiguous ranges (per relation), rather than in one contiguous
  range. To adapt to this kind of model, we would need a sharding scheme where each
  relation had its own range of shards, which would be inefficient for the common
  case of databases with many small relations.
 - Monitor the system, and automatically re-shard at some size threshold. For
  example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
  component monitors the size of each RADOS Pool, and adjusts the number of Placement
  Groups (Ceph's shard equivalent).
 ## Requirements
 - A configurable capacity limit per-shard is enforced.
 - Changes in shard count do not interrupt service beyond requiring postgres
  to reconnect (i.e. milliseconds).
 - A human being does not have to choose the shard count.
 ## Non Goals
 - Shard splitting is always a tenant-global operation: we will not enable splitting
  one shard while leaving others intact.
 - The inverse operation (shard merging) is not described in this RFC. This is a lower
  priority than splitting, because databases grow more often than they shrink, and
  a database with many shards will still work properly if the stored data shrinks, just
  with slightly more overhead (e.g. redundant WAL replication)
 - Shard splitting is only initiated based on capacity bounds, not load. Splitting
  a tenant based on load will make sense for some medium-capacity, high-load workloads,
  but is more complex to reason about and likely is not desirable until we have
  shard merging to reduce the shard count again if the database becomes less busy.
 ## Impacted Components
 pageserver, storage controller
 (the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
 ## Terminology
 **Parent** shards are the shards that exist before a split. **Child** shards are
 the new shards created during a split.
 **Shard** is synonymous with _tenant shard_.
 **Shard Index** is the 2-tuple of shard number and shard count, written in
 paths as {:02x}{:02x}, e.g. `0001`.
 ## Background
 In the implementation section, a couple of existing aspects of sharding are important
 to remember:
 - Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
  a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
  storage paths, and remote index metadata.
 - Remote layer file paths contain the shard index of the shard that created them, and
  remote indices contain the same index to enable building the layer file path. A shard's
  index may reference layers that were created by another shard.
 - Local tenant shard directories include the shard index. All layers downloaded by
  a tenant shard are stored in this shard-prefixed path, even if those layers were
  initially created by another shard: tenant shards do not read and write one another's
  paths.
 - The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
  This is for historical reasons and will be cleaned up in future, but the existing
  name is used here to help comprehension when reading code.
 ## Implementation
 Note: this section focuses on the correctness of the core split process. This will
 be fairly inefficient in a naive implementation, and several important optimizations
 are described in a later section.
 There are broadly two parts to the implementation:
 1. The pageserver split API, which splits one shard on one pageserver
 2. The overall tenant split process, which is coordinated by the storage controller,
   and calls into the pageserver split API as needed.
 ### Pageserver Split API
 The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
 that takes the new total shard count in the body.
 The pageserver split API operates on one tenant shard, on one pageserver. External
 coordination is required to use it safely: this is described in the later
 'Split procedure' section.
 #### Preparation
 First identify the shard indices for the new child shards. These are deterministic,
 calculated from the parent shard's index, and the number of children being created (this
 is an input to the API, and validated to be a power of two). In a trivial example, splitting
 0001 in two always results in 0002 and 0102.
 Child shard indices are chosen such that the children's parts of the keyspace will
 be subsets of the parent's parts of the keyspace.
 #### Step 1: write new remote indices
 In remote storage, splitting is very simple: we may just write new index_part.json
 objects for each child shard, containing exactly the same layers as the parent shard.
 The children will have more data than they need, but this avoids any exhaustive
 re-writing or copying of layer files.
 The index key path includes a generation number: the parent shard's current
 attached generation number will also be used for the child shards' indices. This
 makes the operation safely retryable: if everything crashes and restarts, we may
 call the split API again on the parent shard, and the result will be some new remote
 indices for the child shards, under a higher generation number.
 #### Step 2: start new `Tenant` objects
 A new `Tenant` object may be instantiated for each child shard, while the parent
 shard still exists. When calling the tenant_spawn function for this object,
 the remote index from step 1 will be read, and the child shard will start
 to ingest WAL to catch up from whatever was in the remote storage at step 1.
 We now wait for child shards' WAL ingestion to catch up with the parent shard,
 so that we can safely tear down the parent shard without risking an availability
 gap to clients reading recent LSNs.
 #### Step 3: tear down parent `Tenant` object
 Once child shards are running and have caught up with WAL ingest, we no longer
 need the parent shard. Note that clients may still be using it -- when we
 shut it down, any page_service handlers will also shut down, causing clients
 to disconnect. When the client reconnects, it will re-lookup the tenant,
 and hit the child shard instead of the parent (shard lookup from page_service
 should bias toward higher ShardCount shards).
 Note that at this stage the page service client has not yet been notified of
 any split. In the trivial single split example:
 - Shard 0001 is gone: Tenant object torn down
 - Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
 - Clients will continue to connect to that server thinking that shard 0001 is there,
  and all requests will work, because any key that was in shard 0001 is definitely
  available in either shard 0002 or shard 0102.
 - Eventually, the storage controller (not the pageserver) will decide to migrate
  some child shards away: at that point it will do a live migration, ensuring
  that the client has an updated configuration before it detaches anything
  from the original server.
 #### Complete
 When we send a 200 response to the split request, we are promising the caller:
 - That the child shards are persistent in remote storage
 - That the parent shard has been shut down
 This enables the caller to proceed with the overall shard split operation, which
 may involve other shards on other pageservers.
 ### Storage Controller Split procedure
 Splitting a tenant requires calling the pageserver split API, and tracking
 enough state to ensure recovery + completion in the event of any component (pageserver
 or storage controller) crashing (or request timing out) during the split.
 1. call the split API on all existing shards. Ensure that the resulting
   child shards are pinned to their pageservers until _all_ the split calls are done.
   This pinning may be implemented as a "split bit" on the tenant shards, that
   blocks any migrations, and also acts as a sign that if we restart, we must go
   through some recovery steps to resume the split.
 2. Once all the split calls are done, we may unpin the child shards (clear
   the split bit). The split is now complete: subsequent steps are just migrations,
   not strictly part of the split.
 3. Try to schedule new pageserver locations for the child shards, using
   a soft anti-affinity constraint to place shards from the same tenant onto different
   pageservers.
 Updating computes about the new shard count is not necessary until we migrate
 any of the child shards away from the parent's location.
 ### Recovering from failures
 #### Rolling back an incomplete split
 An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
 and detaching child shards. This will lose any WAL ingested into the children after the parents
 were detached earlier, but the parents will catch up.
 No special pageserver API is needed for this. From the storage controller's point of view, the
 procedure is:
 1. For all parent shards in the tenant, ensure they are attached
 2. For all child shards, ensure they are not attached
 3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
 Any remote storage content for child shards is left behind. This is similar to other cases where
 we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
 index that references it). Future online scrub/cleanup functionality can remove these objects, or
 they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
 which would include any child shards that were rolled back.
 If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
 this, we will **block timeline creation during splitting**, so that we can safely roll back until
 the split is complete, without risking losing timelines.
 Rolling back an incomplete split will happen automatically if a split fails due to some fatal
 reason, and will not be accessible via an API:
 - A pageserver fails to complete its split API request after too many retries
 - A pageserver returns a fatal unexpected error such as 400 or 500
 - The storage controller database returns a non-retryable error
 - Some internal invariant is violated in the storage controller split code
 #### Rolling back a complete split
 A complete shard split may be rolled back similarly to an incomplete split, with the following
 modifications:
 - The parent shards will no longer exist in the storage controller database, so these must
  be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
  may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
  shards in the storage controller database.
 - Any timelines that were created after the split completed will disappear when rolling back
  to the tenant shards. For this reason, rolling back after a complete split should only
  be done due to serious issues where loss of recently created timelines is acceptable, or
  in cases where we have confirmed that no timelines were created in the intervening period.
 - Parent shards' layers must not have been deleted: this property will come "for free" when
  we first roll out sharding, by simply not implementing deletion of parent layers after
  a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
  Optimizations section), it should apply a TTL to layers such that we have a
  defined walltime window in which rollback will be possible.
 The storage controller will expose an API for rolling back a complete split, for use
 in the field if we encounter some critical bug with a post-split tenant.
 #### Retrying API calls during Pageserver Restart
 When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
 child shards from an ongoing split. This does not intrinsically break anything, and the
 pageserver may include all these shards in its `/re-attach` request to the storage controller.
 In order to support such restarts, it is important that the storage controller stores
 persistent records of each child shard before it calls into a pageserver, as these child shards
 may require generation increments via a `/re-attach` request.
 The pageserver restart will also result in a failed API call from the storage controller's point
 of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
 complete, and all shards must remain pinned to their current pageserver locations until the
 split is done.
 The pageserver API calls during splitting will retry on transient errors, so that
 short availability gaps do not result in a failure of the overall operation. The
 split in progress will be automatically rolled back if the threshold for API
 retries is reached (e.g. if a pageserver stays offline for longer than a typical
 restart).
 #### Rollback on Storage Controller Restart
 On startup, the storage controller will inspect the split bit for tenant shards that
 it loads from the database. If any splits are in progress:
 - Database content will be reverted to the parent shards
 - Child shards will be dropped from memory
 - The parent and child shards will be included in the general startup reconciliation that
  the storage controller does: any child shards will be detached from pageservers because
  they don't exist in the storage controller's expected set of shards, and parent shards
  will be attached if they aren't already.
 #### Storage controller API request failures/retries
 The split request handler will implement idempotency: if the [`Tenant`] requested to split
 doesn't exist, we will check for the would-be child shards, and if they already exist,
 we consider the request complete.
 If a request is retried while the original request is still underway, then the split
 request handler will notice an InProgress marker in TenantManager, and return 503
 to encourage the client to backoff/retry. This is the same as the general pageserver
 API handling for calls that try to act on an InProgress shard.
 #### Compute start/restart during a split
 If a compute starts up during split, it will be configured with the old sharding
 configuration. This will work for reads irrespective of the progress of the split
 as long as no child shards have been migrated away from their original location, and
 this is guaranteed in the split procedure (see earlier section).
 #### Pageserver fails permanently during a split
 If a pageserver permanently fails (i.e. the storage controller availability state for it
 goes to Offline) while a split is in progress, the splitting operation will roll back, and
 during the roll back it will skip any API calls to the offline pageserver. If the offline
 pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
 ### Handling secondary locations
 For correctness, it is not necessary to split secondary locations. We can simply detach
 the secondary locations for parent shards, and then attach new secondary locations
 for child shards.
 Clearly this is not optimal, as it will result in re-downloads of layer files that
 were already present on disk. See "Splitting secondary locations" in the Optimizations section.
 ### Conditions to trigger a split
 The pageserver will expose a new API for reporting on shards that are candidates
 for split: this will return a top-N report of the largest tenant shards by
 physical size (remote size). This should exclude any tenants that are already
 at the maximum configured shard count.
 The API would look something like:
 `/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
 The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
 A split operation will be started when the tenant exceeds some threshold. This threshold
 should be _less than_ how large we actually want shards to be, perhaps much less. That's to
 minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
 wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
 tenant size distribution may be useful here: if we can make a statement like "usually, if
 a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
 make our policy to split a tenant at 20GiB.
 The finest split we can do is by factors of two, but we can do higher-cardinality splits
 too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
 as it grows. An example of a very simple heuristic for early deployment of the splitting
 feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
 would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
 split a tenant, it will not need re-splitting soon after.
 ## Optimizations
 ### Flush parent shard to remote storage during split
 Any data that is in WAL but not remote storage at time of split will need
 to be replayed by child shards when they start for the first time. To minimize
 this work, we may flush the parent shard to remote storage before writing the
 remote indices for child shards.
 It is important that this flush is subject to some time bounds: we may be splitting
 in response to a surge of write ingest, so it may be time-critical to split. A
 few seconds to flush latest data should be sufficient to optimize common cases without
 running the risk of holding up a split for a harmful length of time when a parent
 shard is being written heavily. If the flush doesn't complete in time, we may proceed
 to shut down the parent shard and carry on with the split.
 ### Hard linking parent layers into child shard directories
 Before we start the Tenant objects for child shards, we may pre-populate their
 local storage directories with hard links to the layer files already present
 in the parent shard's local directory. When the child shard starts and downloads
 its remote index, it will find all those layer files already present on local disk.
 This avoids wasting download capacity and makes splitting faster, but more importantly
 it avoids taking up a factor of N more disk space when splitting 1 shard into N.
 This mechanism will work well in typical flows where shards are migrated away
 promptly after a split, but for the general case including what happens when
 layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
 section below.
 ### Filtering during compaction
 Compaction, especially image layer generation, should skip any keys that are
 present in a shard's layer files, but do not match the shard's ShardIdentity's
 is_key_local() check. This avoids carrying around data for longer than necessary
 in post-split compactions.
 This was already implemented in https://github.com/neondatabase/neon/pull/6246
 ### Proactive compaction
 In remote storage, there is little reason to rewrite any data on a shard split:
 all the children can reference parent layers via the very cheap write of the child
 index_part.json.
 In local storage, things are more nuanced. During the initial split there is no
 capacity cost to duplicating parent layers, if we implement the hard linking
 optimization described above. However, as soon as any layers are evicted from
 local disk and re-downloaded, the downloaded layers will not be hard-links any more:
 they'll have real capacity footprint. That isn't a problem if we migrate child shards
 away from the parent node swiftly, but it risks a significant over-use of local disk
 space if we do not.
 For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
 the shards elsewhere, then churned all the layers in all the shards via eviction,
 then we would blow up the storage capacity used on the node by 8x. If we're splitting
 a 100GB shard, that could take the pageserver to the point of exhausting disk space.
 To avoid this scenario, we could implement a special compaction mode where we just
 read historic layers, drop unwanted keys, and write back the layer file. This
 is pretty expensive, but useful if we have split a large shard and are not going to
 migrate the child shards away.
 The heuristic conditions for triggering such a compaction are:
 - A) eviction plus time: if a child shard
  has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
 - B) resident size plus time: we may inspect the resident layers and calculate how
  many of them include the overhead of storing pre-split keys. If, after some time
  threshold (different to the one in case A), we still have such layers occupying
  local disk space, then we should proactively compact them.
 ### Cleaning up parent-shard layers
 It is functionally harmless to leave parent shard layers in remote storage indefinitely.
 They would be cleaned up in the event of the tenant's deletion.
 As an optimization to avoid leaking remote storage capacity (which costs money), we may
 lazily clean up parent shard layers once no child shards reference them.
 This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
 - list all the key prefixes beginning with the tenant ID, and select those shard prefixes
  which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
 - If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
  may drop out now.
 - find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
 - for all ancestral shards, list objects in the prefix and delete any layer which was not
  referenced by a current shard.
 If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
 The cleanup may be done by the scrubber (external process), or we may choose to have
 the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
 reading the other shard's indices at runtime, and we do not require visibility of the
 latest index writes.
 Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
 that we retain the option to roll back a split in case of bugs.
 ### Splitting secondary locations
 We may implement a pageserver API similar to the main splitting API, which does a simpler
 operation for secondary locations: it would not write anything to S3, instead it would simply
 create the child shard directory on local disk, hard link in directories from the parent,
 and set up the in memory (TenantSlot) state for the children.
 Similar to attached locations, a subset of secondary locations will probably need re-locating
 after the split is complete, to avoid leaving multiple child shards on the same pageservers,
 where they may use excessive space for the tenant.
 ## FAQ/Alternatives
 ### What should the thresholds be set to?
 Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
 Max shard count:
 - The safekeeper overhead to sharding is currently O(N) network bandwidth because
  the un-filtered WAL is sent to all shards. To avoid this growing out of control,
  a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
  on the safekeeper.
 - there is also little benefit to increasing the shard count beyond the number
  of pageservers in a region.
 ### Is it worth just rewriting all the data during a split to simplify reasoning about space?
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -40,7 +40,7 @@ macro_rules! register_hll {
    }};
    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
-        $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
    }};
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -29,7 +29,6 @@ pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
 pub mod metric_vec_duration;
 pub use hll::{HyperLogLog, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -1,23 +0,0 @@
 //! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
 use std::{future::Future, time::Instant};
 /// Sink for the outcome and elapsed time of an operation that produced a `Result`.
 ///
 /// [`observe_async_block_duration_by_result`] calls
 /// [`DurationResultObserver::observe_result`] exactly once, after the wrapped
 /// future has completed.
 pub trait DurationResultObserver {
    /// Record `res` together with the `duration` the operation took.
    ///
    /// The result is only borrowed. NOTE(review): implementations presumably use it
    /// to tell success from failure (e.g. as a metric label) — confirm against implementors.
    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
 }
 /// Await `block`, report its result and wall-clock duration to `observer`,
 /// then hand the result back to the caller unchanged.
 ///
 /// The timer starts when this future is first polled and stops as soon as
 /// `block` resolves; the observer is notified for both `Ok` and `Err` outcomes.
 pub async fn observe_async_block_duration_by_result<T, E, F, O>(
    observer: &O,
    block: F,
 ) -> Result<T, E>
 where
    F: Future<Output = Result<T, E>>,
    O: DurationResultObserver,
 {
    let started_at = Instant::now();
    let outcome = block.await;
    // The elapsed time is taken before the observer runs, so the observer's
    // own cost is not included in the measurement.
    observer.observe_result(&outcome, started_at.elapsed());
    outcome
 }
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -6,7 +6,10 @@ use std::str::FromStr;
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;
-use crate::{models::ShardParameters, shard::TenantShardId};
+use crate::{
    models::{ShardParameters, TenantConfig},
    shard::{ShardStripeSize, TenantShardId},
 };
 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
@@ -35,7 +38,7 @@ pub struct NodeRegisterRequest {
 pub struct NodeConfigureRequest {
    pub node_id: NodeId,
-    pub availability: Option<NodeAvailability>,
+    pub availability: Option<NodeAvailabilityWrapper>,
    pub scheduling: Option<NodeSchedulingPolicy>,
 }
@@ -57,6 +60,31 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }
 /// Tenant-wide description: per-shard details plus settings shared by all shards.
 ///
 /// NOTE(review): presumably the body returned by the storage controller's tenant
 /// "describe" endpoint — confirm against the HTTP handler.
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
    /// Per-shard details, see [`TenantDescribeResponseShard`].
    pub shards: Vec<TenantDescribeResponseShard>,
    /// Stripe size of the tenant's sharding layout (see [`ShardStripeSize`]).
    pub stripe_size: ShardStripeSize,
    /// Placement policy for the tenant (see [`PlacementPolicy`]).
    pub policy: PlacementPolicy,
    /// Tenant configuration (see [`TenantConfig`]).
    pub config: TenantConfig,
 }
 /// Description of a single tenant shard, as carried in [`TenantDescribeResponse::shards`].
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    /// Identity of the shard being described.
    pub tenant_shard_id: TenantShardId,
    /// NOTE(review): presumably the node holding the attached location, `None` when
    /// the shard is not attached anywhere — confirm against the producer.
    pub node_attached: Option<NodeId>,
    /// NOTE(review): presumably the nodes holding secondary locations — confirm.
    pub node_secondary: Vec<NodeId>,
    /// NOTE(review): presumably the most recent error for this shard, empty when
    /// there is none — confirm against the producer.
    pub last_error: String,
    /// A task is currently running to reconcile this tenant's intent state with the state on pageservers
    pub is_reconciling: bool,
    /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
    pub is_pending_compute_notification: bool,
    /// A shard split is currently underway
    pub is_splitting: bool,
 }
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -66,30 +94,82 @@ pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
 }
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+/// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
 pub struct UtilizationScore(pub u64);
 impl UtilizationScore {
    pub fn worst() -> Self {
        UtilizationScore(u64::MAX)
    }
 }
 #[derive(Serialize, Clone, Copy)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
-    Active,
+    Active(UtilizationScore),
    // Offline: Tenants shouldn't try to attach here, but they may assume that their
    // secondary locations on this node still exist.  Newly added nodes are in this
    // state until we successfully contact them.
    Offline,
 }
 impl PartialEq for NodeAvailability {
    fn eq(&self, other: &Self) -> bool {
        use NodeAvailability::*;
        matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
    }
 }
 impl Eq for NodeAvailability {}
 /// This wrapper provides serde functionality and it should only be used to
 /// communicate with external callers which don't know or care about the
 /// utilisation score of the pageserver it is targeting.
 ///
 /// It mirrors the variants of [`NodeAvailability`] without the score payload;
 /// `From` conversions exist in both directions.
 #[derive(Serialize, Deserialize, Clone)]
 pub enum NodeAvailabilityWrapper {
    /// Counterpart of [`NodeAvailability::Active`], without the utilisation score.
    Active,
    /// Counterpart of [`NodeAvailability::Offline`].
    Offline,
 }
 impl From<NodeAvailabilityWrapper> for NodeAvailability {
    /// Expand the score-less external representation into the internal one.
    fn from(wrapper: NodeAvailabilityWrapper) -> Self {
        match wrapper {
            NodeAvailabilityWrapper::Offline => Self::Offline,
            // The wrapper carries no score, so start from the worst one and
            // let the heartbeats update it later.
            NodeAvailabilityWrapper::Active => Self::Active(UtilizationScore::worst()),
        }
    }
 }
 impl From<NodeAvailability> for NodeAvailabilityWrapper {
    /// Strip the utilisation score, keeping only which variant the node is in.
    fn from(availability: NodeAvailability) -> Self {
        match availability {
            NodeAvailability::Offline => Self::Offline,
            NodeAvailability::Active(_) => Self::Active,
        }
    }
 }
 impl FromStr for NodeAvailability {
    type Err = anyhow::Error;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
-            "active" => Ok(Self::Active),
+            // This is used when parsing node configuration requests from neon-local.
            // Assume the worst possible utilisation score
            // and let it get updated via the heartbeats.
            "active" => Ok(Self::Active(UtilizationScore::worst())),
            "offline" => Ok(Self::Offline),
            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
        }
    }
 }
 /// FIXME: this is a duplicate of the type in the attachment_service crate, because the
 /// type needs to be defined with diesel traits in there.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
    Active,
@@ -129,11 +209,8 @@ impl From<NodeSchedulingPolicy> for String {
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 pub enum PlacementPolicy {
-    /// Cheapest way to attach a tenant: just one pageserver, no secondary
+    /// Normal live state: one attached pageserver and zero or more secondaries.
-    Single,
+    Attached(usize),
    /// Production-ready way to attach a tenant: one attached pageserver and
    /// some number of secondaries.
    Double(usize),
    /// Create one secondary mode locations. This is useful when onboarding
    /// a tenant, or for an idle tenant that we might want to bring online quickly.
    Secondary,
@@ -155,14 +232,14 @@ mod test {
    /// Check stability of PlacementPolicy's serialization
    #[test]
    fn placement_policy_encoding() -> anyhow::Result<()> {
-        let v = PlacementPolicy::Double(1);
+        let v = PlacementPolicy::Attached(1);
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "{\"Double\":1}");
+        assert_eq!(encoded, "{\"Attached\":1}");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
-        let v = PlacementPolicy::Single;
+        let v = PlacementPolicy::Detached;
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "\"Single\"");
+        assert_eq!(encoded, "\"Detached\"");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
        Ok(())
    }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -4,6 +4,7 @@ pub mod utilization;
 pub use utilization::PageserverUtilization;
 use std::{
    borrow::Cow,
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
@@ -198,6 +199,13 @@ pub struct TimelineCreateRequest {
 /// Parameters of a tenant shard split request.
 #[derive(Serialize, Deserialize)]
 pub struct TenantShardSplitRequest {
    /// Shard count requested by the split.
    pub new_shard_count: u8,
    // A tenant's stripe size is only meaningful the first time their shard count goes
    // above 1: therefore during a split from 1->N shards, we may modify the stripe size.
    //
    // If this is set while the shard count is being increased from an already >1 value,
    // then the request will fail with 400.
    pub new_stripe_size: Option<ShardStripeSize>,
 }
 #[derive(Serialize, Deserialize)]
@@ -419,7 +427,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    pub tenant_id: TenantShardId,
+    pub tenant_id: Option<TenantShardId>,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -570,7 +578,7 @@ pub struct TimelineInfo {
    pub walreceiver_status: String,
 }
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerMapInfo {
    pub in_memory_layers: Vec<InMemoryLayerInfo>,
    pub historic_layers: Vec<HistoricLayerInfo>,
@@ -588,7 +596,7 @@ pub enum LayerAccessKind {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStatFullDetails {
    pub when_millis_since_epoch: u64,
-    pub task_kind: &'static str,
+    pub task_kind: Cow<'static, str>,
    pub access_kind: LayerAccessKind,
 }
@@ -647,23 +655,23 @@ impl LayerResidenceEvent {
    }
 }
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStats {
    pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
-    pub task_kind_access_flag: Vec<&'static str>,
+    pub task_kind_access_flag: Vec<Cow<'static, str>>,
    pub first: Option<LayerAccessStatFullDetails>,
    pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
    Open { lsn_start: Lsn },
    Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }
-#[derive(Debug, Clone, Serialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
    Delta {
@@ -685,6 +693,32 @@ pub enum HistoricLayerInfo {
    },
 }
 impl HistoricLayerInfo {
    pub fn layer_file_name(&self) -> &str {
        match self {
            HistoricLayerInfo::Delta {
                layer_file_name, ..
            } => layer_file_name,
            HistoricLayerInfo::Image {
                layer_file_name, ..
            } => layer_file_name,
        }
    }
    pub fn is_remote(&self) -> bool {
        match self {
            HistoricLayerInfo::Delta { remote, .. } => *remote,
            HistoricLayerInfo::Image { remote, .. } => *remote,
        }
    }
    pub fn set_remote(&mut self, value: bool) {
        let field = match self {
            HistoricLayerInfo::Delta { remote, .. } => remote,
            HistoricLayerInfo::Image { remote, .. } => remote,
        };
        *field = value;
    }
 }
 #[derive(Debug, Serialize, Deserialize)]
 pub struct DownloadRemoteLayersTaskSpawnRequest {
    pub max_concurrent_downloads: NonZeroUsize,
@@ -717,6 +751,52 @@ pub struct WalRedoManagerStatus {
    pub pid: Option<u32>,
 }
 /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
 /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
 /// what's happening.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
 pub struct SecondaryProgress {
    /// The remote storage LastModified time of the heatmap object we last downloaded.
    // Encoded through the `opt_*_rfc3339_millis` helpers: an RFC 3339 string when
    // present, a serde "none" when absent.
    #[serde(
        serialize_with = "opt_ser_rfc3339_millis",
        deserialize_with = "opt_deser_rfc3339_millis"
    )]
    pub heatmap_mtime: Option<SystemTime>,
    /// The number of layers currently on-disk
    pub layers_downloaded: usize,
    /// The number of layers in the most recently seen heatmap
    pub layers_total: usize,
    /// The number of layer bytes currently on-disk
    pub bytes_downloaded: u64,
    /// The number of layer bytes in the most recently seen heatmap
    pub bytes_total: u64,
 }
 /// Serde `serialize_with` helper for `Option<SystemTime>`: writes the timestamp
 /// as an RFC 3339 string with millisecond precision, or a serde "none" when the
 /// value is absent.
 fn opt_ser_rfc3339_millis<S: serde::Serializer>(
    ts: &Option<SystemTime>,
    serializer: S,
 ) -> Result<S::Ok, S::Error> {
    if let Some(stamp) = ts {
        let formatted = humantime::format_rfc3339_millis(*stamp);
        serializer.collect_str(&formatted)
    } else {
        serializer.serialize_none()
    }
 }
 fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
 where
    D: serde::de::Deserializer<'de>,
 {
    let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
    match s {
        None => Ok(None),
        Some(s) => humantime::parse_rfc3339(&s)
            .map_err(serde::de::Error::custom)
            .map(Some),
    }
 }
 pub mod virtual_file {
    #[derive(
        Copy,
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -7,7 +7,7 @@ use std::time::SystemTime;
 ///
 /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
 /// not handle full u64 values properly.
-#[derive(serde::Serialize, Debug)]
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 pub struct PageserverUtilization {
    /// Used disk space
    #[serde(serialize_with = "ser_saturating_u63")]
@@ -21,7 +21,10 @@ pub struct PageserverUtilization {
    /// When was this snapshot captured, pageserver local time.
    ///
    /// Use millis to give confidence that the value is regenerated often enough.
-    #[serde(serialize_with = "ser_rfc3339_millis")]
+    #[serde(
        serialize_with = "ser_rfc3339_millis",
        deserialize_with = "deser_rfc3339_millis"
    )]
    pub captured_at: SystemTime,
 }
@@ -32,6 +35,14 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
 }
 fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
 where
    D: serde::de::Deserializer<'de>,
 {
    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
 }
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of newtype, use this because a newtype would require handling deserializing values
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,19 +6,36 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;
-use crate::shard::TenantShardId;
+use crate::{
    controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
 };
 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
 /// startup.
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachRequest {
    /// ID of the node issuing the upcall (presumably the pageserver's own node ID).
    pub node_id: NodeId,
    /// Optional inline self-registration: this is useful with the storage controller,
    /// if the node already has a node_id set.
    // Left out of the serialized form when `None`, and defaults to `None` when the
    // field is missing on input.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub register: Option<NodeRegisterRequest>,
 }
-#[derive(Serialize, Deserialize)]
+fn default_mode() -> LocationConfigMode {
    LocationConfigMode::AttachedSingle
 }
 #[derive(Serialize, Deserialize, Debug)]
 pub struct ReAttachResponseTenant {
    pub id: TenantShardId,
-    pub gen: u32,
+    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
-}
+    pub gen: Option<u32>,
    /// Default value only for backward compat: this field should be set
    #[serde(default = "default_mode")]
    pub mode: LocationConfigMode,
 }
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -1,5 +1,6 @@
 use anyhow::*;
 use clap::{value_parser, Arg, ArgMatches, Command};
 use postgres::Client;
 use std::{path::PathBuf, str::FromStr};
 use wal_craft::*;
@@ -8,8 +9,8 @@ fn main() -> Result<()> {
        .init();
    let arg_matches = cli().get_matches();
-    let wal_craft = |arg_matches: &ArgMatches, client| {
+    let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
-        let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
+        let intermediate_lsns = match arg_matches
            .get_one::<String>("type")
            .map(|s| s.as_str())
            .context("'type' is required")?
@@ -25,6 +26,7 @@ fn main() -> Result<()> {
            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
            a => panic!("Unknown --type argument: {a}"),
        };
        let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
        for lsn in intermediate_lsns {
            println!("intermediate_lsn = {lsn}");
        }
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -5,7 +5,6 @@ use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::cmp::Ordering;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -232,59 +231,52 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
 pub trait Crafter {
    const NAME: &'static str;
-    /// Generates WAL using the client `client`. Returns a pair of:
+    /// Generates WAL using the client `client`. Returns a vector of some valid
-    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
+    /// "interesting" intermediate LSNs which one may start reading from.
-    ///   May include or exclude Lsn(0) and the end-of-wal.
+    /// test_end_of_wal uses this to check various starting points.
-    /// * The expected end-of-wal LSN.
+    ///
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
+    /// Note that postgres is generally keen about writing some WAL. While we
    /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
    /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
    /// stable WAL end would be flaky unless postgres is shut down. For this
    /// reason returning potential end of WAL here is pointless. Most of the
    /// time this doesn't happen though, so it is reasonable to create needed
    /// WAL structure and immediately kill postgres like test_end_of_wal does.
    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
 }
 /// Wraps some WAL craft function, providing current LSN to it before the
 /// insertion and flushing WAL afterwards. Also pushes initial LSN to the
 /// result.
 fn craft_internal<C: postgres::GenericClient>(
    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
+    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
-) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+) -> anyhow::Result<Vec<PgLsn>> {
    ensure_server_config(client)?;
    let initial_lsn = client.pg_current_wal_insert_lsn()?;
    info!("LSN initial = {}", initial_lsn);
-    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
+    let mut intermediate_lsns = f(client, initial_lsn)?;
    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
        Some(last_lsn) => {
            let insert_lsn = client.pg_current_wal_insert_lsn()?;
            match last_lsn.cmp(&insert_lsn) {
                Ordering::Less => bail!(
                    "Some records were inserted after the crafted WAL: {} vs {}",
                    last_lsn,
                    insert_lsn
                ),
                Ordering::Equal => last_lsn,
                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
            }
        }
    };
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
    }
    // Some records may be not flushed, e.g. non-transactional logical messages.
    //
    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
    // because pg_current_wal_insert_lsn skips page headers.
    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
-    match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
+    Ok(intermediate_lsns)
        Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
        Ordering::Equal => {}
        Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
    }
    Ok((intermediate_lsns, last_lsn))
 }
 pub struct Simple;
 impl Crafter for Simple {
    const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
        craft_internal(client, |client, _| {
            client.execute("CREATE table t(x int)", &[])?;
-            Ok((Vec::new(), None))
+            Ok(Vec::new())
        })
    }
 }
@@ -292,29 +284,36 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
    const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
-        // Do not use generate_internal because here we end up with flush_lsn exactly on
+        // Do not use craft_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
        client.execute("CREATE table t(x int)", &[])?;
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        // pg_switch_wal returns end of last record of the switched segment,
-        let next_segment = PgLsn::from(0x0200_0000);
+        // i.e. end of SWITCH itself.
        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let before_xlog_switch_u64 = u64::from(before_xlog_switch);
        let next_segment = PgLsn::from(
            before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
                + WAL_SEGMENT_SIZE as u64,
        );
        ensure!(
-            after_xlog_switch <= next_segment,
+            xlog_switch_record_end <= next_segment,
-            "XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
+            "XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
-            after_xlog_switch,
+            xlog_switch_record_end,
            next_segment
        );
-        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
+        Ok(vec![before_xlog_switch, xlog_switch_record_end])
    }
 }
 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
 /// Craft xlog SWITCH record ending at page boundary.
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
        // Do not use craft_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -361,28 +360,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        // Emit the XLOG_SWITCH
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let next_segment = PgLsn::from(0x0200_0000);
        ensure!(
-            after_xlog_switch < next_segment,
+            xlog_switch_record_end < next_segment,
-            "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
+            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
-            after_xlog_switch,
+            xlog_switch_record_end,
            next_segment
        );
        ensure!(
-            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
-            after_xlog_switch,
+            xlog_switch_record_end,
-            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
        );
-        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
+        Ok(vec![before_xlog_switch, xlog_switch_record_end])
    }
 }
-fn craft_single_logical_message(
+/// Write ~16MB logical message; it should cross WAL segment.
 fn craft_seg_size_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+) -> anyhow::Result<Vec<PgLsn>> {
    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -405,34 +405,24 @@ fn craft_single_logical_message(
            "Logical message crossed two segments"
        );
-        if transactional {
+        Ok(vec![message_lsn])
            // Transactional logical messages are part of a transaction, so the one above is
            // followed by a small COMMIT record.
            let after_message_lsn = client.pg_current_wal_insert_lsn()?;
            ensure!(
                message_lsn < after_message_lsn,
                "No record found after the emitted message"
            );
            Ok((vec![message_lsn], Some(after_message_lsn)))
        } else {
            Ok((Vec::new(), Some(message_lsn)))
        }
    })
 }
 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
-        craft_single_logical_message(client, true)
+        // Transactional message crossing WAL segment will be followed by small
        // commit record.
        craft_seg_size_logical_message(client, true)
    }
 }
 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
    const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
-        craft_single_logical_message(client, false)
+        craft_seg_size_logical_message(client, false)
    }
 }
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -11,13 +11,15 @@ use utils::const_assert;
 use utils::lsn::Lsn;
 fn init_logging() {
-    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
-        format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
+        "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
-    ))
+    )))
    .is_test(true)
    .try_init();
 }
 /// Test that find_end_of_wal returns the same results as pg_dump on various
 /// WALs created by Crafter.
 fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    use crate::*;
@@ -38,13 +40,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
    cfg.initdb().unwrap();
    let srv = cfg.start_server().unwrap();
-    let (intermediate_lsns, expected_end_of_wal_partial) =
+    let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
        .iter()
        .map(|&lsn| u64::from(lsn).into())
        .collect();
-    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
+    // Kill postgres. Note that it might have inserted to WAL something after
    // 'craft' did its job.
    srv.kill();
    // Check find_end_of_wal on the initial WAL
@@ -56,7 +58,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
        .filter(|fname| IsXLogFileName(fname))
        .max()
        .unwrap();
-    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
+    let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
    for start_lsn in intermediate_lsns
        .iter()
        .chain(std::iter::once(&expected_end_of_wal))
@@ -91,11 +93,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
 }
-fn check_pg_waldump_end_of_wal(
+fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
    cfg: &crate::Conf,
    last_segment: &str,
    expected_end_of_wal: Lsn,
 ) {
    // Get the actual end of WAL by pg_waldump
    let waldump_output = cfg
        .pg_waldump("000000010000000000000001", last_segment)
@@ -113,11 +111,8 @@ fn check_pg_waldump_end_of_wal(
        }
    };
    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
-    info!(
+    info!("waldump erred on {}", waldump_wal_end);
-        "waldump erred on {}, expected wal end at {}",
+    waldump_wal_end
        waldump_wal_end, expected_end_of_wal
    );
    assert_eq!(waldump_wal_end, expected_end_of_wal);
 }
 fn check_end_of_wal(
@@ -210,9 +205,9 @@ pub fn test_update_next_xid() {
 #[test]
 pub fn test_encode_logical_message() {
    let expected = [
-        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
+        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
-        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
+        0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
-        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+        105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
    ];
    let actual = encode_logical_message("prefix", "message");
    assert_eq!(expected, actual[..]);
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -18,6 +18,7 @@ camino.workspace = true
 humantime.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -157,9 +157,8 @@ impl AzureBlobStorage {
            let mut bufs = Vec::new();
            while let Some(part) = response.next().await {
                let part = part?;
                let etag_str: &str = part.blob.properties.etag.as_ref();
                if etag.is_none() {
-                    etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
+                    etag = Some(part.blob.properties.etag);
                }
                if last_modified.is_none() {
                    last_modified = Some(part.blob.properties.last_modified.into());
@@ -174,6 +173,16 @@ impl AzureBlobStorage {
                    .map_err(|e| DownloadError::Other(e.into()))?;
                bufs.push(data);
            }
            if bufs.is_empty() {
                return Err(DownloadError::Other(anyhow::anyhow!(
                    "Azure GET response contained no buffers"
                )));
            }
            // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
            let etag = etag.unwrap();
            let last_modified = last_modified.unwrap();
            Ok(Download {
                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
                etag,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -42,6 +42,9 @@ pub use self::{
 };
 use s3_bucket::RequestKind;
 /// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
 pub use azure_core::Etag;
 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
@@ -291,9 +294,9 @@ pub type DownloadStream =
 pub struct Download {
    pub download_stream: DownloadStream,
    /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: Option<SystemTime>,
+    pub last_modified: SystemTime,
    /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Option<String>,
+    pub etag: Etag,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -10,7 +10,7 @@ use std::{
    io::ErrorKind,
    num::NonZeroU32,
    pin::Pin,
-    time::{Duration, SystemTime},
+    time::{Duration, SystemTime, UNIX_EPOCH},
 };
 use anyhow::{bail, ensure, Context};
@@ -30,6 +30,7 @@ use crate::{
 };
 use super::{RemoteStorage, StorageMetadata};
 use crate::Etag;
 const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
@@ -197,6 +198,7 @@ impl LocalFs {
            fs::OpenOptions::new()
                .write(true)
                .create(true)
                .truncate(true)
                .open(&temp_file_path)
                .await
                .with_context(|| {
@@ -406,35 +408,37 @@ impl RemoteStorage for LocalFs {
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
            let source = ReaderStream::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
                    .await
                    .with_context(|| {
                        format!("Failed to open source file {target_path:?} to use in the download")
                    })
                    .map_err(DownloadError::Other)?,
            );
-            let metadata = self
+        let file_metadata = file_metadata(&target_path).await?;
-                .read_storage_metadata(&target_path)
+
        let source = ReaderStream::new(
            fs::OpenOptions::new()
                .read(true)
                .open(&target_path)
                .await
-                .map_err(DownloadError::Other)?;
+                .with_context(|| {
                    format!("Failed to open source file {target_path:?} to use in the download")
                })
                .map_err(DownloadError::Other)?,
        );
-            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+        let metadata = self
-            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+            .read_storage_metadata(&target_path)
            .await
            .map_err(DownloadError::Other)?;
-            Ok(Download {
+        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-                metadata,
+        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
-                last_modified: None,
+
-                etag: None,
+        let etag = mock_etag(&file_metadata);
-                download_stream: Box::pin(source),
+        Ok(Download {
-            })
+            metadata,
-        } else {
+            last_modified: file_metadata
-            Err(DownloadError::NotFound)
+                .modified()
-        }
+                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
            etag,
            download_stream: Box::pin(source),
        })
    }
    async fn download_byte_range(
@@ -452,50 +456,51 @@ impl RemoteStorage for LocalFs {
                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
            }
        }
        let target_path = from.with_base(&self.storage_root);
-        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
+        let file_metadata = file_metadata(&target_path).await?;
-            let mut source = tokio::fs::OpenOptions::new()
+        let mut source = tokio::fs::OpenOptions::new()
-                .read(true)
+            .read(true)
-                .open(&target_path)
+            .open(&target_path)
-                .await
+            .await
-                .with_context(|| {
+            .with_context(|| {
-                    format!("Failed to open source file {target_path:?} to use in the download")
+                format!("Failed to open source file {target_path:?} to use in the download")
                })
                .map_err(DownloadError::Other)?;
            let len = source
                .metadata()
                .await
                .context("query file length")
                .map_err(DownloadError::Other)?
                .len();
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
                .context("Failed to seek to the range start in a local storage file")
                .map_err(DownloadError::Other)?;
            let metadata = self
                .read_storage_metadata(&target_path)
                .await
                .map_err(DownloadError::Other)?;
            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
            let source = ReaderStream::new(source);
            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
            Ok(Download {
                metadata,
                last_modified: None,
                etag: None,
                download_stream: Box::pin(source),
            })
-        } else {
+            .map_err(DownloadError::Other)?;
-            Err(DownloadError::NotFound)
+
-        }
+        let len = source
            .metadata()
            .await
            .context("query file length")
            .map_err(DownloadError::Other)?
            .len();
        source
            .seek(io::SeekFrom::Start(start_inclusive))
            .await
            .context("Failed to seek to the range start in a local storage file")
            .map_err(DownloadError::Other)?;
        let metadata = self
            .read_storage_metadata(&target_path)
            .await
            .map_err(DownloadError::Other)?;
        let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
        let source = ReaderStream::new(source);
        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
        let etag = mock_etag(&file_metadata);
        Ok(Download {
            metadata,
            last_modified: file_metadata
                .modified()
                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
            etag,
            download_stream: Box::pin(source),
        })
    }
    async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
@@ -610,13 +615,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
    Ok(())
 }
-fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
+async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
-    if file_path.exists() {
+    tokio::fs::metadata(&file_path).await.map_err(|e| {
-        ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
+        if e.kind() == ErrorKind::NotFound {
-        Ok(true)
+            DownloadError::NotFound
-    } else {
+        } else {
-        Ok(false)
+            DownloadError::BadInput(e.into())
-    }
+        }
    })
 }
 /// Derives a stand-in ETag from the file's modification time, rendered as the number of
 /// milliseconds since the Unix epoch.
 ///
 /// A content hash (e.g. md5) would be a more meaningful tag, but computing it is expensive and
 /// the local_fs test helper exists precisely to run small tests quickly, with less overhead
 /// than a mock S3 server.
 ///
 /// # Panics
 ///
 /// Panics if the modification time is unavailable or lies before the Unix epoch.
 fn mock_etag(meta: &std::fs::Metadata) -> Etag {
    let modified_at = meta.modified().expect("Filesystem mtime missing");
    let since_epoch = modified_at.duration_since(UNIX_EPOCH).unwrap();
    since_epoch.as_millis().to_string().into()
 }
 #[cfg(test)]
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_types::byte_stream::ByteStream;
 use aws_smithy_types::{body::SdkBody, DateTime};
 use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
@@ -287,8 +287,17 @@ impl S3Bucket {
        let remaining = self.timeout.saturating_sub(started_at.elapsed());
        let metadata = object_output.metadata().cloned().map(StorageMetadata);
-        let etag = object_output.e_tag;
+        let etag = object_output
-        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
+            .e_tag
            .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
            .into();
        let last_modified = object_output
            .last_modified
            .ok_or(DownloadError::Other(anyhow::anyhow!(
                "Missing LastModified header"
            )))?
            .try_into()
            .map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
        let body = object_output.body;
        let body = ByteStreamAsStream::from(body);
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -17,6 +17,7 @@ use remote_storage::{
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
 use tokio::io::AsyncBufReadExt;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
@@ -117,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // A little check to ensure that our clock is not too far off from the S3 clock
    {
        let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
-        let last_modified = dl.last_modified.unwrap();
+        let last_modified = dl.last_modified;
        let half_wt = WAIT_TIME.mul_f32(0.5);
        let t0_hwt = t0 + half_wt;
        let t1_hwt = t1 - half_wt;
@@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
    ))
    .unwrap();
-    let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
+    let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
    {
-        let mut stream = ctx
+        let stream = ctx
            .client
            .download(&path, &cancel)
            .await
            .expect("download succeeds")
            .download_stream;
-        let first = stream
+        let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));
            .next()
            .await
            .expect("should have the first blob")
            .expect("should have succeeded");
-        tracing::info!(len = first.len(), "downloaded first chunk");
+        let first = reader.fill_buf().await.expect("should have the first blob");
        let len = first.len();
        tracing::info!(len, "downloaded first chunk");
        assert!(
-            first.len() < len,
+            first.len() < file_len,
            "uploaded file is too small, we downloaded all on first chunk"
        );
        reader.consume(len);
        cancel.cancel();
-        let next = stream.next().await.expect("stream should have more");
+        let next = reader.fill_buf().await;
        let e = next.expect_err("expected an error, but got a chunk?");
@@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
            "{inner:?}"
        );
        let e = DownloadError::from(e);
        assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
    }
    let cancel = CancellationToken::new();
--- a/libs/tenant_size_model/tests/tests.rs
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -247,7 +247,7 @@ fn scenario_4() {
    //
    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
    //
-    // (If we used the the method from the previous scenario, and
+    // (If we used the method from the previous scenario, and
    // kept only snapshot at the branch point, we'd need to keep
    // all the WAL between 10000-18000 on the main branch, so
    // the total size would be 5000 + 1000 + 8000 = 14000. The
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -13,6 +13,7 @@ testing = ["fail/failpoints"]
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
@@ -36,6 +37,7 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
@@ -46,6 +48,7 @@ strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
 uuid.workspace = true
 walkdir.workspace = true
 pq_proto.workspace = true
 postgres_connection.workspace = true
--- a/libs/utils/src/history_buffer.rs
+++ b/libs/utils/src/history_buffer.rs
@@ -47,9 +47,10 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
    }
 }
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 struct SerdeRepr<T> {
    buffer: Vec<T>,
    buffer_size: usize,
    drop_count: u64,
 }
@@ -61,6 +62,7 @@ where
        let HistoryBufferWithDropCounter { buffer, drop_count } = value;
        SerdeRepr {
            buffer: buffer.iter().cloned().collect(),
            buffer_size: L,
            drop_count: *drop_count,
        }
    }
@@ -78,19 +80,52 @@ where
    }
 }
 impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
 where
    T: Clone + serde::Deserialize<'de>,
 {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let SerdeRepr {
            buffer: des_buffer,
            drop_count,
            buffer_size,
        } = SerdeRepr::<T>::deserialize(deserializer)?;
        if buffer_size != L {
            use serde::de::Error;
            return Err(D::Error::custom(format!(
                "invalid buffer_size, expecting {L} got {buffer_size}"
            )));
        }
        let mut buffer = HistoryBuffer::new();
        buffer.extend(des_buffer);
        Ok(HistoryBufferWithDropCounter { buffer, drop_count })
    }
 }
 #[cfg(test)]
 mod test {
    use super::HistoryBufferWithDropCounter;
    #[test]
    fn test_basics() {
-        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
+        let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
        b.write(1);
        b.write(2);
        b.write(3);
        assert!(b.iter().any(|e| *e == 2));
        assert!(b.iter().any(|e| *e == 3));
        assert!(!b.iter().any(|e| *e == 1));
        // round-trip serde
        let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
            serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
        assert_eq!(
            round_tripped.iter().cloned().collect::<Vec<_>>(),
            b.iter().cloned().collect::<Vec<_>>()
        );
    }
    #[test]
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
    }
 }
-async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();
    let started_at = std::time::Instant::now();
@@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .err_handler(route_error_handler)
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -87,6 +87,8 @@ pub mod failpoint_support;
 pub mod yielding_loop;
 pub mod zstd;
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -63,6 +63,7 @@ impl UnwrittenLockFile {
 pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
        .truncate(true)
        .write(true)
        .open(lock_file_path)
        .context("open lock file")?;
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -29,12 +29,10 @@ pub struct PageserverFeedback {
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
    pub replytime: SystemTime,
    /// Used to track feedbacks from different shards. Always zero for unsharded tenants.
    pub shard_number: u32,
 }
 // NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
 // Do not remove previously available fields because this might be backwards incompatible.
 pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
 impl PageserverFeedback {
    pub fn empty() -> PageserverFeedback {
        PageserverFeedback {
@@ -43,6 +41,7 @@ impl PageserverFeedback {
            remote_consistent_lsn: Lsn::INVALID,
            disk_consistent_lsn: Lsn::INVALID,
            replytime: *PG_EPOCH,
            shard_number: 0,
        }
    }
@@ -59,17 +58,26 @@ impl PageserverFeedback {
    //
    // TODO: change serialized fields names once all computes migrate to rename.
    pub fn serialize(&self, buf: &mut BytesMut) {
-        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
+        let buf_ptr = buf.len();
        buf.put_u8(0); // # of keys, will be filled later
        let mut nkeys = 0;
        nkeys += 1;
        buf.put_slice(b"current_timeline_size\0");
        buf.put_i32(8);
        buf.put_u64(self.current_timeline_size);
        nkeys += 1;
        buf.put_slice(b"ps_writelsn\0");
        buf.put_i32(8);
        buf.put_u64(self.last_received_lsn.0);
        nkeys += 1;
        buf.put_slice(b"ps_flushlsn\0");
        buf.put_i32(8);
        buf.put_u64(self.disk_consistent_lsn.0);
        nkeys += 1;
        buf.put_slice(b"ps_applylsn\0");
        buf.put_i32(8);
        buf.put_u64(self.remote_consistent_lsn.0);
@@ -80,9 +88,19 @@ impl PageserverFeedback {
            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
            .as_micros() as i64;
        nkeys += 1;
        buf.put_slice(b"ps_replytime\0");
        buf.put_i32(8);
        buf.put_i64(timestamp);
        if self.shard_number > 0 {
            nkeys += 1;
            buf.put_slice(b"shard_number\0");
            buf.put_i32(4);
            buf.put_u32(self.shard_number);
        }
        buf[buf_ptr] = nkeys;
    }
    // Deserialize PageserverFeedback message
@@ -123,6 +141,11 @@ impl PageserverFeedback {
                        rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
                    }
                }
                b"shard_number" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 4);
                    rf.shard_number = buf.get_u32();
                }
                _ => {
                    let len = buf.get_i32();
                    warn!(
@@ -194,10 +217,7 @@ mod tests {
        rf.serialize(&mut data);
        // Add an extra field to the buffer and adjust number of keys
-        if let Some(first) = data.first_mut() {
+        data[0] += 1;
            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
        }
        data.put_slice(b"new_field_one\0");
        data.put_i32(8);
        data.put_u64(42);
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -110,6 +110,49 @@ impl<T> OnceCell<T> {
        }
    }
    /// Returns a guard to an existing initialized value, or returns a unique initialization
    /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
    ///
    /// If no value is present, this awaits a permit on the current initialization semaphore
    /// before handing out the `InitPermit`.
    ///
    /// # Panics
    ///
    /// Panics if locking `inner` fails, or if the semaphore is found closed while it is still
    /// the current one and no value is present.
    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
        // It looks like OnceCell::get_or_init could be implemented using this method instead of
        // duplication. However, that makes the future be !Send due to possibly holding on to the
        // MutexGuard over an await point.
        loop {
            // Check for a value and grab the semaphore handle inside a block, so the lock
            // guard is dropped before the `.await` below.
            let sem = {
                let guard = self.inner.lock().unwrap();
                if guard.value.is_some() {
                    // Already initialized: hand out the guard, which keeps the lock held.
                    return Ok(Guard(guard));
                }
                guard.init_semaphore.clone()
            };
            {
                let permit = {
                    // increment the count for the duration of queued
                    let _guard = CountWaitingInitializers::start(self);
                    sem.acquire().await
                };
                // `acquire` failed; per the assertion message below this means the semaphore
                // was closed.
                let Ok(permit) = permit else {
                    let guard = self.inner.lock().unwrap();
                    if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
                        // there was a take_and_deinit in between
                        continue;
                    }
                    assert!(
                        guard.value.is_some(),
                        "semaphore got closed, must be initialized"
                    );
                    return Ok(Guard(guard));
                };
                // NOTE(review): presumably a tokio semaphore permit, in which case `forget`
                // keeps the permit from being released on drop; the returned `InitPermit`
                // stands in for it from here on. Confirm against the semaphore type.
                permit.forget();
            }
            let permit = InitPermit(sem);
            return Err(permit);
        }
    }
    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
@@ -202,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(mut self) -> (T, InitPermit) {
        let mut swapped = Inner::default();
        let sem = swapped.init_semaphore.clone();
        // acquire and forget right away, moving the control over to InitPermit
@@ -481,4 +524,39 @@ mod tests {
        assert_eq!("t1", *cell.get().unwrap());
    }
    /// Smoke test for detached initialization: while the permit from `get_or_init_detached`
    /// is held, `get_or_init` cannot make progress; `set` with that permit initializes the
    /// cell; and the permit returned by `take_and_deinit` can initialize it again.
    #[tokio::test(start_paused = true)]
    async fn detached_init_smoke() {
        let cell = OnceCell::default();
        // Nothing has been set yet, so we expect the permit rather than a guard.
        let detached_permit = match cell.get_or_init_detached().await {
            Err(permit) => permit,
            Ok(_) => unreachable!("it is not initialized"),
        };
        let long_wait = std::time::Duration::from_secs(3600 * 24 * 7 * 365);
        let competing_init =
            cell.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) });
        tokio::time::timeout(long_wait, competing_init)
            .await
            .expect_err("should timeout since we are already holding the permit");
        cell.set(42, detached_permit);
        let (_answer, reinit_permit) = {
            let initialized = cell
                .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
                .await
                .unwrap();
            // The value stored through the detached permit wins over the init closure.
            assert_eq!(*initialized, 42);
            initialized.take_and_deinit()
        };
        assert!(cell.get().is_none());
        cell.set(11, reinit_permit);
        assert_eq!(*cell.get().unwrap(), 11);
    }
 }
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,27 +1,60 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
 /// Ordering constraint checked against the current last key when a key is added to a `VecMap`.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum VecMapOrdering {
    /// A new key must be strictly greater than the current last key.
    Greater,
    /// A new key must be greater than or equal to the current last key (equal keys are accepted).
    GreaterOrEqual,
 }
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
 /// Ordering can be adjusted using [`VecMapOrdering`]
 /// during `VecMap` construction.
 #[derive(Clone, Debug)]
-pub struct VecMap<K, V>(Vec<(K, V)>);
+pub struct VecMap<K, V> {
    data: Vec<(K, V)>,
    ordering: VecMapOrdering,
 }
 impl<K, V> Default for VecMap<K, V> {
    fn default() -> Self {
-        VecMap(Default::default())
+        VecMap {
            data: Default::default(),
            ordering: VecMapOrdering::Greater,
        }
    }
 }
-#[derive(Debug)]
+#[derive(thiserror::Error, Debug)]
-pub struct InvalidKey;
+pub enum VecMapError {
    #[error("Key violates ordering constraint")]
    InvalidKey,
    #[error("Mismatched ordering constraints")]
    ExtendOrderingError,
 }
 impl<K: Ord, V> VecMap<K, V> {
    /// Creates an empty map that enforces the given key `ordering`.
    pub fn new(ordering: VecMapOrdering) -> Self {
        let data = Vec::new();
        Self { data, ordering }
    }
    /// Creates an empty map that enforces the given key `ordering`, with backing storage
    /// preallocated for at least `capacity` entries.
    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
        let data = Vec::with_capacity(capacity);
        Self { data, ordering }
    }
    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
+        self.data.is_empty()
    }
    pub fn as_slice(&self) -> &[(K, V)] {
-        self.0.as_slice()
+        self.data.as_slice()
    }
    /// This function may panic if given a range where the lower bound is
@@ -29,7 +62,7 @@ impl<K: Ord, V> VecMap<K, V> {
    pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
        use std::ops::Bound::*;
-        let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
+        let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
        let start_idx = match range.start_bound() {
            Unbounded => 0,
@@ -41,7 +74,7 @@ impl<K: Ord, V> VecMap<K, V> {
        };
        let end_idx = match range.end_bound() {
-            Unbounded => self.0.len(),
+            Unbounded => self.data.len(),
            Included(k) => match binary_search(k) {
                Ok(idx) => idx + 1,
                Err(idx) => idx,
@@ -49,34 +82,30 @@ impl<K: Ord, V> VecMap<K, V> {
            Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
        };
-        &self.0[start_idx..end_idx]
+        &self.data[start_idx..end_idx]
    }
    /// Add a key value pair to the map.
-    /// If `key` is less than or equal to the current maximum key
+    /// If `key` is not respective of the `self` ordering the
-    /// the pair will not be added and InvalidKey error will be returned.
+    /// pair will not be added and `InvalidKey` error will be returned.
-    pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
+    pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
-        if let Some((last_key, _last_value)) = self.0.last() {
+        self.validate_key_order(&key)?;
            if &key <= last_key {
                return Err(InvalidKey);
            }
        }
        let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
        Ok(delta_size)
    }
    /// Update the maximum key value pair or add a new key value pair to the map.
-    /// If `key` is less than the current maximum key no updates or additions
+    /// If `key` is not respective of the `self` ordering no updates or additions
-    /// will occur and InvalidKey error will be returned.
+    /// will occur and `InvalidKey` error will be returned.
    pub fn append_or_update_last(
        &mut self,
        key: K,
        mut value: V,
-    ) -> Result<(Option<V>, usize), InvalidKey> {
+    ) -> Result<(Option<V>, usize), VecMapError> {
-        if let Some((last_key, last_value)) = self.0.last_mut() {
+        if let Some((last_key, last_value)) = self.data.last_mut() {
            match key.cmp(last_key) {
-                Ordering::Less => return Err(InvalidKey),
+                Ordering::Less => return Err(VecMapError::InvalidKey),
                Ordering::Equal => {
                    std::mem::swap(last_value, &mut value);
                    const DELTA_SIZE: usize = 0;
@@ -100,40 +129,67 @@ impl<K: Ord, V> VecMap<K, V> {
        V: Clone,
    {
        let split_idx = self
-            .0
+            .data
            .binary_search_by_key(&cutoff, extract_key)
            .unwrap_or_else(std::convert::identity);
        (
-            VecMap(self.0[..split_idx].to_vec()),
+            VecMap {
-            VecMap(self.0[split_idx..].to_vec()),
+                data: self.data[..split_idx].to_vec(),
                ordering: self.ordering,
            },
            VecMap {
                data: self.data[split_idx..].to_vec(),
                ordering: self.ordering,
            },
        )
    }
    /// Move items from `other` to the end of `self`, leaving `other` empty.
-    /// If any keys in `other` is less than or equal to any key in `self`,
+    /// If the `other` ordering is different from `self` ordering
-    /// `InvalidKey` error will be returned and no mutation will occur.
+    /// `ExtendOrderingError` error will be returned.
-    pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
+    /// If any keys in `other` is not respective of the ordering defined in
-        let self_last_opt = self.0.last().map(extract_key);
+    /// `self`, `InvalidKey` error will be returned and no mutation will occur.
-        let other_first_opt = other.0.last().map(extract_key);
+    pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
        if self.ordering != other.ordering {
            return Err(VecMapError::ExtendOrderingError);
        }
-        if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
+        let other_first_opt = other.data.last().map(extract_key);
-            if self_last >= other_first {
+        if let Some(other_first) = other_first_opt {
-                return Err(InvalidKey);
+            self.validate_key_order(other_first)?;
        }
        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
        Ok(delta_size)
    }
    /// Validate the current last key in `self` and key being
    /// inserted against the order defined in `self`.
    fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
        if let Some(last_key) = self.data.last().map(extract_key) {
            match (&self.ordering, &key.cmp(last_key)) {
                (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
                    return Err(VecMapError::InvalidKey);
                }
                (VecMapOrdering::Greater, Ordering::Greater) => {}
                (VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
                    return Err(VecMapError::InvalidKey);
                }
                (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
            }
        }
-        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
+        Ok(())
        Ok(delta_size)
    }
    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
-        let old_cap = self.0.capacity();
+        let old_cap = self.data.capacity();
-        op(&mut self.0);
+        op(&mut self.data);
-        let new_cap = self.0.capacity();
+        let new_cap = self.data.capacity();
        match old_cap.cmp(&new_cap) {
            Ordering::Less => {
@@ -145,6 +201,36 @@ impl<K: Ord, V> VecMap<K, V> {
            Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
        }
    }
    /// Builds a `VecMap` from `iter`, like `FromIterator::from_iter` but with an explicit
    /// [`VecMapOrdering`].
    ///
    /// # Panics
    ///
    /// Panics if the items of `iter` are not already sorted according to `ordering`.
    pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
        let entries = iter.into_iter();
        // Size the backing storage from the hint's upper bound, or the lower bound if there is none.
        let (lower_bound, upper_bound) = entries.size_hint();
        let mut map = VecMap::with_capacity(upper_bound.unwrap_or(lower_bound), ordering);
        entries.for_each(|(key, value)| {
            map.append(key, value)
                .expect("The passed collection needs to be sorted!");
        });
        map
    }
 }
 impl<K: Ord, V> IntoIterator for VecMap<K, V> {
    type Item = (K, V);
    type IntoIter = std::vec::IntoIter<(K, V)>;
    fn into_iter(self) -> Self::IntoIter {
        self.data.into_iter()
    }
 }
 fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -155,7 +241,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
 mod tests {
    use std::{collections::BTreeMap, ops::Bound};
-    use super::VecMap;
+    use super::{VecMap, VecMapOrdering};
    #[test]
    fn unbounded_range() {
@@ -310,5 +396,59 @@ mod tests {
        left.extend(&mut one_map).unwrap_err();
        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
        assert_eq!(one_map.as_slice(), &[(1, ())]);
        let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
        map_greater_or_equal.append(2, ()).unwrap();
        map_greater_or_equal.append(2, ()).unwrap();
        left.extend(&mut map_greater_or_equal).unwrap_err();
        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
        assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
    }
    /// `extend` must reject a source with a different ordering and accept one with the same
    /// ordering, including duplicate keys under `GreaterOrEqual`.
    #[test]
    fn extend_with_ordering() {
        let mut target = VecMap::new(VecMapOrdering::GreaterOrEqual);
        target.append(0, ()).unwrap();
        assert_eq!(target.as_slice(), &[(0, ())]);

        // Mismatched ordering: rejected, target untouched.
        let mut strict_source = VecMap::new(VecMapOrdering::Greater);
        strict_source.append(0, ()).unwrap();
        target.extend(&mut strict_source).unwrap_err();
        assert_eq!(target.as_slice(), &[(0, ())]);

        // Matching ordering: accepted, duplicates preserved.
        let mut relaxed_source = VecMap::new(VecMapOrdering::GreaterOrEqual);
        for _ in 0..2 {
            relaxed_source.append(2, ()).unwrap();
        }
        target.extend(&mut relaxed_source).unwrap();
        assert_eq!(target.as_slice(), &[(0, ()), (2, ()), (2, ())]);
    }
    /// `from_iter` accepts input that is sorted according to the requested ordering.
    #[test]
    fn vec_map_from_sorted() {
        let strictly_increasing = vec![(1, ()), (2, ()), (3, ()), (6, ())];
        let strict_map = VecMap::from_iter(strictly_increasing, VecMapOrdering::Greater);
        assert_eq!(strict_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);

        let with_duplicates = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
        let relaxed_map = VecMap::from_iter(with_duplicates, VecMapOrdering::GreaterOrEqual);
        assert_eq!(
            relaxed_map.as_slice(),
            &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
        );
    }
    /// A duplicate key violates `Greater`, so `from_iter` must panic.
    #[test]
    #[should_panic]
    fn vec_map_from_unsorted_greater() {
        let with_duplicate = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
        let _ = VecMap::from_iter(with_duplicate, VecMapOrdering::Greater);
    }
    /// A decreasing key violates `GreaterOrEqual`, so `from_iter` must panic.
    #[test]
    #[should_panic]
    fn vec_map_from_unsorted_greater_or_equal() {
        let decreasing_tail = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
        let _ = VecMap::from_iter(decreasing_tail, VecMapOrdering::GreaterOrEqual);
    }
 }
--- a/libs/utils/src/zstd.rs
+++ b/libs/utils/src/zstd.rs
@@ -0,0 +1,78 @@
 use std::io::SeekFrom;
 use anyhow::{Context, Result};
 use async_compression::{
    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
    zstd::CParameter,
    Level,
 };
 use camino::Utf8Path;
 use nix::NixPath;
 use tokio::{
    fs::{File, OpenOptions},
    io::AsyncBufRead,
    io::AsyncSeekExt,
    io::AsyncWriteExt,
 };
 use tokio_tar::{Archive, Builder, HeaderMode};
 use walkdir::WalkDir;
 /// Creates a Zstandard-compressed tarball of the directory tree rooted at `path`.
 ///
 /// The archive is written to the file at `tarball`, which is created if missing and
 /// truncated otherwise. Entries are appended in sorted path order with deterministic
 /// tar headers, under names relative to `path`. The root directory itself is not
 /// added, and entries that are neither files nor directories are skipped.
 ///
 /// Returns the archive file, rewound to its start, together with its length in bytes.
 ///
 /// # Errors
 ///
 /// Fails if the tarball cannot be opened, if walking `path` or reading an entry's
 /// metadata fails, or if building, compressing or flushing the archive fails.
 pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
    let file = OpenOptions::new()
        .create(true)
        .truncate(true)
        .read(true)
        .write(true)
        .open(tarball)
        .await
        .with_context(|| format!("tempfile creation {tarball}"))?;
    let mut paths = Vec::new();
    for entry in WalkDir::new(path) {
        let entry = entry?;
        // Reading metadata is I/O and can fail: report it to the caller instead of panicking.
        let metadata = entry
            .metadata()
            .with_context(|| format!("get metadata of {}", entry.path().display()))?;
        // Also allow directories so that we also get empty directories
        if !(metadata.is_file() || metadata.is_dir()) {
            continue;
        }
        paths.push(entry.into_path());
    }
    // Do a sort to get a more consistent listing
    paths.sort_unstable();
    let zstd = ZstdEncoder::with_quality_and_params(
        file,
        Level::Default,
        &[CParameter::enable_long_distance_matching(true)],
    );
    let mut builder = Builder::new(zstd);
    // Use reproducible header mode
    builder.mode(HeaderMode::Deterministic);
    for p in paths {
        let rel_path = p.strip_prefix(path)?;
        if rel_path.is_empty() {
            // The top directory should not be compressed,
            // the tar crate doesn't like that
            continue;
        }
        builder.append_path_with_name(&p, rel_path).await?;
    }
    // Finish the tar stream, then shut down the encoder so the zstd frame is complete
    // before the file length is measured.
    let mut zstd = builder.into_inner().await?;
    zstd.shutdown().await?;
    let mut compressed = zstd.into_inner();
    let compressed_len = compressed.metadata().await?.len();
    // Rewind so the caller can read the archive from the beginning.
    compressed.seek(SeekFrom::Start(0)).await?;
    Ok((compressed, compressed_len))
 }
 /// Extracts a Zstandard-compressed tarball into the directory `path`.
 ///
 /// `tarball` is the compressed byte stream; it is wrapped in a `ZstdDecoder` and
 /// unpacked with `tokio_tar`'s `Archive::unpack`. Any error from unpacking is returned.
 pub async fn extract_zst_tarball(
    path: &Utf8Path,
    tarball: impl AsyncBufRead + Unpin,
 ) -> Result<()> {
    // NOTE(review): the decoder is heap-pinned, presumably so the reader handed to
    // `Archive` is `Unpin` — confirm against the `tokio_tar` bounds.
    let decoder = Box::pin(ZstdDecoder::new(tarball));
    let mut archive = Archive::new(decoder);
    archive.unpack(path).await?;
    Ok(())
 }
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -69,7 +69,7 @@ pub struct Config {
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,
-    /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
+    /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
    /// threshold.
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
    }
 }
-extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
+        (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
    }
 }
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -142,7 +142,7 @@ pub trait ApiImpl {
        todo!()
    }
-    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
+    fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
        todo!()
    }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -59,6 +59,7 @@ signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
 sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
@@ -89,6 +90,9 @@ enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
 [dev-dependencies]
 criterion.workspace = true
 hex-literal.workspace = true
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,160 +1,156 @@
-//! Simple benchmarking around walredo.
+//! Quantify a single walredo manager's throughput under N concurrent callers.
 //!
-//! Right now they hope to just set a baseline. Later we can try to expand into latency and
+//! The benchmark implementation ([`bench_impl`]) is parametrized by
-//! throughput after figuring out the coordinated omission problems below.
+//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`]
 //! - `n_redos` => number of times the benchmark shall execute the `redo_work`
 //! - `nclients` => number of clients (more on this shortly).
 //!
-//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
+//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
-//! logging what happens when a sequential scan is requested on a small table, then picking out two
+//! It spawns `nclients` times [`client`] tokio tasks.
-//! suitable from logs.
+//! Each task executes the `redo_work` `n_redos/nclients` times.
 //!
 //! We exercise the following combinations:
 //! - `redo_work = short / medium`
 //! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
-//! Reference data (git blame to see commit) on an i3en.3xlarge
+//! We let `criterion` determine the `n_redos` using `iter_custom`.
-// ```text
+//! The idea is that for each `(redo_work, nclients)` combination,
-//! short/short/1           time:   [39.175 µs 39.348 µs 39.536 µs]
+//! criterion will run the `bench_impl` multiple times with different `n_redos`.
-//! short/short/2           time:   [51.227 µs 51.487 µs 51.755 µs]
+//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
-//! short/short/4           time:   [76.048 µs 76.362 µs 76.674 µs]
+//! Criterion will divide that by `n_redos` to compute the "time per iteration".
-//! short/short/8           time:   [128.94 µs 129.82 µs 130.74 µs]
+//! In our case, "time per iteration" means "time per redo_work execution".
-//! short/short/16          time:   [227.84 µs 229.00 µs 230.28 µs]
+//!
-//! short/short/32          time:   [455.97 µs 457.81 µs 459.90 µs]
+//! NB: the way by which `iter_custom` determines the "number of iterations"
-//! short/short/64          time:   [902.46 µs 904.84 µs 907.32 µs]
+//! is called sampling. Apparently the idea here is to detect outliers.
-//! short/short/128         time:   [1.7416 ms 1.7487 ms 1.7561 ms]
+//! We're not sure whether the current choice of sampling method makes sense.
-//! ``
+//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
-
+//!
-use std::sync::Arc;
+//! # Reference Numbers
 //!
 //! 2024-03-20 on i3en.3xlarge
 //!
 //! ```text
 //! short/1                 time:   [26.483 µs 26.614 µs 26.767 µs]
 //! short/2                 time:   [32.223 µs 32.465 µs 32.767 µs]
 //! short/4                 time:   [47.203 µs 47.583 µs 47.984 µs]
 //! short/8                 time:   [89.135 µs 89.612 µs 90.139 µs]
 //! short/16                time:   [190.12 µs 191.52 µs 192.88 µs]
 //! short/32                time:   [380.96 µs 382.63 µs 384.20 µs]
 //! short/64                time:   [736.86 µs 741.07 µs 745.03 µs]
 //! short/128               time:   [1.4106 ms 1.4206 ms 1.4294 ms]
 //! medium/1                time:   [111.81 µs 112.25 µs 112.79 µs]
 //! medium/2                time:   [158.26 µs 159.13 µs 160.21 µs]
 //! medium/4                time:   [334.65 µs 337.14 µs 340.07 µs]
 //! medium/8                time:   [675.32 µs 679.91 µs 685.25 µs]
 //! medium/16               time:   [1.2929 ms 1.2996 ms 1.3067 ms]
 //! medium/32               time:   [2.4295 ms 2.4461 ms 2.4623 ms]
 //! medium/64               time:   [4.3973 ms 4.4458 ms 4.4875 ms]
 //! medium/128              time:   [7.5955 ms 7.7847 ms 7.9481 ms]
 //! ```
 use bytes::{Buf, Bytes};
-use pageserver::{
+use criterion::{BenchmarkId, Criterion};
-    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
+use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
    sync::Arc,
    time::{Duration, Instant},
 };
-use pageserver_api::shard::TenantShardId;
+use tokio::{sync::Barrier, task::JoinSet};
 use tokio::task::JoinSet;
 use utils::{id::TenantId, lsn::Lsn};
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+fn bench(c: &mut Criterion) {
    {
        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
        for nclients in nclients {
            let mut group = c.benchmark_group("short");
            group.bench_with_input(
                BenchmarkId::from_parameter(nclients),
                &nclients,
                |b, nclients| {
                    let redo_work = Arc::new(Request::short_input());
                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
                },
            );
        }
    }
-fn redo_scenarios(c: &mut Criterion) {
+    {
-    // logging should be enabled when adding more inputs, since walredo will only report malformed
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-    // input to the stderr.
+        for nclients in nclients {
-    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
+            let mut group = c.benchmark_group("medium");
            group.bench_with_input(
                BenchmarkId::from_parameter(nclients),
                &nclients,
                |b, nclients| {
                    let redo_work = Arc::new(Request::medium_input());
                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
                },
            );
        }
    }
 }
 criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);
 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
 fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
    let manager = Arc::new(manager);
    {
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();
        tracing::info!("executing first");
        rt.block_on(short().execute(&manager)).unwrap();
        tracing::info!("first executed");
    }
    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
    let mut group = c.benchmark_group("short");
    group.sampling_mode(criterion::SamplingMode::Flat);
    for thread_count in thread_counts {
        group.bench_with_input(
            BenchmarkId::new("short", thread_count),
            &thread_count,
            |b, thread_count| {
                add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
            },
        );
    }
    drop(group);
    let mut group = c.benchmark_group("medium");
    group.sampling_mode(criterion::SamplingMode::Flat);
    for thread_count in thread_counts {
        group.bench_with_input(
            BenchmarkId::new("medium", thread_count),
            &thread_count,
            |b, thread_count| {
                add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
            },
        );
    }
    drop(group);
 }
 /// Sets up a multi-threaded tokio runtime with default worker thread count,
 /// then, spawn `requesters` tasks that repeatedly:
 /// - get input from `input_factor()`
 /// - call `manager.request_redo()` with their input
 ///
 /// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
 ///
 /// Using tokio's default worker thread count means the results will differ on machines
 /// with different core counts. We don't care about that, the performance will always
 /// be different on different hardware. To compare performance of different software versions,
 /// use the same hardware.
 fn add_multithreaded_walredo_requesters(
    b: &mut criterion::Bencher,
    nrequesters: usize,
    manager: &Arc<PostgresRedoManager>,
    input_factory: fn() -> Request,
 ) {
    assert_ne!(nrequesters, 0);
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();
-    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
+    let start = Arc::new(Barrier::new(nclients as usize));
-    let mut requesters = JoinSet::new();
+    let mut tasks = JoinSet::new();
-    for _ in 0..nrequesters {
+
-        let _entered = rt.enter();
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
-        let manager = manager.clone();
+    let manager = Arc::new(manager);
-        let barrier = barrier.clone();
+
-        requesters.spawn(async move {
+    for _ in 0..nclients {
-            loop {
+        rt.block_on(async {
-                let input = input_factory();
+            tasks.spawn(client(
-                barrier.wait().await;
+                Arc::clone(&manager),
-                let page = input.execute(&manager).await.unwrap();
+                Arc::clone(&start),
-                assert_eq!(page.remaining(), 8192);
+                Arc::clone(&redo_work),
-                barrier.wait().await;
+                // divide the amount of work equally among the clients
-            }
+                n_redos / nclients,
            ))
        });
    }
-    let do_one_iteration = || {
+    rt.block_on(async move {
-        rt.block_on(async {
+        let mut total_wallclock_time = std::time::Duration::from_millis(0);
-            barrier.wait().await;
+        while let Some(res) = tasks.join_next().await {
-            // wait for work to complete
+            total_wallclock_time += res.unwrap();
-            barrier.wait().await;
+        }
-        })
+        total_wallclock_time
-    };
+    })
    b.iter_batched(
        || {
            // warmup
            do_one_iteration();
        },
        |()| {
            // work loop
            do_one_iteration();
        },
        criterion::BatchSize::PerIteration,
    );
    rt.block_on(requesters.shutdown());
 }
-criterion_group!(benches, redo_scenarios);
+async fn client(
-criterion_main!(benches);
+    mgr: Arc<PostgresRedoManager>,
    start: Arc<Barrier>,
    redo_work: Arc<Request>,
    n_redos: u64,
 ) -> Duration {
    start.wait().await;
    let start = Instant::now();
    for _ in 0..n_redos {
        let page = redo_work.execute(&mgr).await.unwrap();
        assert_eq!(page.remaining(), 8192);
        // The real pageserver will rarely if ever do 2 walredos in a row without
        // yielding to the executor.
        tokio::task::yield_now().await;
    }
    start.elapsed()
 }
 macro_rules! lsn {
    ($input:expr) => {{
@@ -166,12 +162,46 @@ macro_rules! lsn {
    }};
 }
-/// Short payload, 1132 bytes.
+/// Simple wrapper around `WalRedoManager::request_redo`.
-// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
+///
-// for null bytes.
+/// In benchmarks this is cloned around.
-#[allow(clippy::octal_escapes)]
+#[derive(Clone)]
-fn short() -> Request {
+struct Request {
-    Request {
+    key: Key,
    lsn: Lsn,
    base_img: Option<(Lsn, Bytes)>,
    records: Vec<(Lsn, NeonWalRecord)>,
    pg_version: u32,
 }
 impl Request {
    async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
        let Request {
            key,
            lsn,
            base_img,
            records,
            pg_version,
        } = self;
        // TODO: avoid these clones
        manager
            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
            .await
    }
    fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
        let rec = Bytes::from_static(bytes);
        NeonWalRecord::Postgres { will_init, rec }
    }
    /// Short payload, 1132 bytes.
    // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
    // for null bytes.
    #[allow(clippy::octal_escapes)]
    pub fn short_input() -> Request {
        let pg_record = Self::pg_record;
        Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -194,13 +224,14 @@ fn short() -> Request {
        ],
        pg_version: 14,
    }
-}
+    }
-/// Medium sized payload, serializes as 26393 bytes.
+    /// Medium sized payload, serializes as 26393 bytes.
-// see [`short`]
+    // see [`short`]
-#[allow(clippy::octal_escapes)]
+    #[allow(clippy::octal_escapes)]
-fn medium() -> Request {
+    pub fn medium_input() -> Request {
-    Request {
+        let pg_record = Self::pg_record;
        Request {
        key: Key {
            field1: 0,
            field2: 1663,
@@ -442,37 +473,5 @@ fn medium() -> Request {
        ],
        pg_version: 14,
    }
 }
 fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
    let rec = Bytes::from_static(bytes);
    NeonWalRecord::Postgres { will_init, rec }
 }
 /// Simple wrapper around `WalRedoManager::request_redo`.
 ///
 /// In benchmarks this is cloned around.
 #[derive(Clone)]
 struct Request {
    key: Key,
    lsn: Lsn,
    base_img: Option<(Lsn, Bytes)>,
    records: Vec<(Lsn, NeonWalRecord)>,
    pg_version: u32,
 }
 impl Request {
    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
        let Request {
            key,
            lsn,
            base_img,
            records,
            pg_version,
        } = self;
        manager
            .request_redo(key, lsn, base_img, records, pg_version)
            .await
    }
 }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -169,7 +169,7 @@ impl Client {
        self.request(Method::GET, uri, ()).await
    }
-    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
+    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
        &self,
        method: Method,
        uri: U,
@@ -181,7 +181,16 @@ impl Client {
        } else {
            req
        };
-        let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
+        req.json(&body).send().await.map_err(Error::ReceiveBody)
    }
    /// Sends a request through `request_noerror` and passes the response through
    /// `error_from_body` before handing it back.
    ///
    /// NOTE(review): `error_from_body` presumably turns error-status responses into
    /// `Err` — confirm against its definition.
    async fn request<B, U>(&self, method: Method, uri: U, body: B) -> Result<reqwest::Response>
    where
        B: serde::Serialize,
        U: reqwest::IntoUrl,
    {
        let unchecked = self.request_noerror(method, uri, body).await?;
        let checked = unchecked.error_from_body().await?;
        Ok(checked)
    }
@@ -240,13 +249,26 @@ impl Client {
        Ok(())
    }
-    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
+    pub async fn tenant_secondary_download(
-        let uri = format!(
+        &self,
        tenant_id: TenantShardId,
        wait: Option<std::time::Duration>,
    ) -> Result<(StatusCode, SecondaryProgress)> {
        let mut path = reqwest::Url::parse(&format!(
            "{}/v1/tenant/{}/secondary/download",
            self.mgmt_api_endpoint, tenant_id
-        );
+        ))
-        self.request(Method::POST, &uri, ()).await?;
+        .expect("Cannot build URL");
-        Ok(())
+
        if let Some(wait) = wait {
            path.query_pairs_mut()
                .append_pair("wait_ms", &format!("{}", wait.as_millis()));
        }
        let response = self.request(Method::POST, path, ()).await?;
        let status = response.status();
        let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
        Ok((status, progress))
    }
    pub async fn location_config(
@@ -257,7 +279,7 @@ impl Client {
        lazy: bool,
    ) -> Result<()> {
        let req_body = TenantLocationConfigRequest {
-            tenant_id: tenant_shard_id,
+            tenant_id: Some(tenant_shard_id),
            config,
        };
@@ -416,4 +438,77 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
        self.get(uri)
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }
    pub async fn layer_map_info(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Result<LayerMapInfo> {
        let uri = format!(
            "{}/v1/tenant/{}/timeline/{}/layer",
            self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
        );
        self.get(&uri)
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }
    /// Sends `DELETE .../timeline/{id}/layer/{layer_file_name}` to the management API.
    ///
    /// Returns `Ok(true)` on `200 OK` and `Ok(false)` on `304 Not Modified`. Any other
    /// status becomes an error: `Error::ApiError` when the body decodes as
    /// `HttpErrorBody`, `Error::ReceiveErrorBody` otherwise.
    pub async fn layer_evict(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        layer_file_name: &str,
    ) -> Result<bool> {
        let uri = format!(
            "{}/v1/tenant/{}/timeline/{}/layer/{}",
            self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
        );
        // `request_noerror` is used so the status code can be inspected here.
        let response = self.request_noerror(Method::DELETE, &uri, ()).await?;
        let status = response.status();
        if status == StatusCode::OK {
            return Ok(true);
        }
        if status == StatusCode::NOT_MODIFIED {
            return Ok(false);
        }
        // TODO: dedupe this pattern / introduce separate error variant?
        let failure = match response.json::<HttpErrorBody>().await {
            Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
            Err(_) => {
                Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
            }
        };
        Err(failure)
    }
    /// Sends `GET .../timeline/{id}/layer/{layer_file_name}` to the management API.
    ///
    /// Returns `Ok(true)` on `200 OK` and `Ok(false)` on `304 Not Modified`. Any other
    /// status becomes an error: `Error::ApiError` when the body decodes as
    /// `HttpErrorBody`, `Error::ReceiveErrorBody` otherwise.
    pub async fn layer_ondemand_download(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        layer_file_name: &str,
    ) -> Result<bool> {
        let uri = format!(
            "{}/v1/tenant/{}/timeline/{}/layer/{}",
            self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
        );
        // `request_noerror` is used so the status code can be inspected here.
        let response = self.request_noerror(Method::GET, &uri, ()).await?;
        let status = response.status();
        if status == StatusCode::OK {
            return Ok(true);
        }
        if status == StatusCode::NOT_MODIFIED {
            return Ok(false);
        }
        // TODO: dedupe this pattern / introduce separate error variant?
        let failure = match response.json::<HttpErrorBody>().await {
            Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
            Err(_) => {
                Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
            }
        };
        Err(failure)
    }
 }
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -0,0 +1,272 @@
 use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
 use pageserver_client::mgmt_api;
 use rand::seq::SliceRandom;
 use tracing::{debug, info};
 use utils::id::{TenantTimelineId, TimelineId};
 use tokio::{
    sync::{mpsc, OwnedSemaphorePermit},
    task::JoinSet,
 };
 use std::{
    num::NonZeroUsize,
    sync::{
        atomic::{AtomicU64, Ordering},
        Arc,
    },
    time::{Duration, Instant},
 };
 /// Evict & on-demand download random layers.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    #[clap(long)]
    pageserver_jwt: Option<String>,
    // NOTE(review): no code visible in this file reads `runtime` — confirm whether a
    // run-time limit is still meant to be implemented.
    #[clap(long)]
    runtime: Option<humantime::Duration>,
    // Number of `timeline_actor` tasks spawned for each target timeline.
    #[clap(long, default_value = "1")]
    tasks_per_target: NonZeroUsize,
    // Number of permits in the semaphore each `timeline_actor` creates, i.e. how many
    // evict/download requests that actor keeps in flight at once.
    #[clap(long, default_value = "1")]
    concurrency_per_target: NonZeroUsize,
    /// Only use the first N discovered targets.
    #[clap(long)]
    limit_to_first_n_targets: Option<usize>,
    /// Before starting the benchmark, live-reconfigure the pageserver to use the given
    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
    #[clap(long)]
    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
    // Explicit list of targets, forwarded to target discovery together with
    // `limit_to_first_n_targets`.
    targets: Option<Vec<TenantTimelineId>>,
 }
 /// Runs the benchmark described by `args` on a fresh multi-threaded tokio runtime.
 ///
 /// # Errors
 ///
 /// Returns an error if the runtime cannot be built or if `main_impl` fails.
 ///
 /// # Panics
 ///
 /// Panics if the spawned `main_impl` task itself panicked or was cancelled.
 pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    let rt = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()?;
    let task = rt.spawn(main_impl(args));
    // A `JoinError` means the task panicked or was cancelled, which is a bug, so it
    // still panics. An `Err` returned by `main_impl` is an ordinary failure and is
    // handed to the caller rather than unwrapped here.
    rt.block_on(task).unwrap()
 }
 /// Event counters shared between the benchmark actors and the periodic stats reporter.
 #[derive(Debug, Default)]
 struct LiveStats {
    evictions: AtomicU64,
    downloads: AtomicU64,
    timeline_restarts: AtomicU64,
 }
 impl LiveStats {
    /// Adds one to `counter` with relaxed ordering.
    fn bump(counter: &AtomicU64) {
        counter.fetch_add(1, Ordering::Relaxed);
    }
    /// Records one eviction request.
    fn eviction_done(&self) {
        Self::bump(&self.evictions);
    }
    /// Records one on-demand download request.
    fn download_done(&self) {
        Self::bump(&self.downloads);
    }
    /// Records one restart of a timeline actor's inner loop.
    fn timeline_restart_done(&self) {
        Self::bump(&self.timeline_restarts);
    }
 }
 /// Sets up and drives the benchmark: configures the pageserver, discovers the target
 /// timelines, then spawns one stats reporter plus `tasks_per_target` timeline actors
 /// per target and waits on all of them.
 ///
 /// Both the stats reporter and `timeline_actor` loop without a terminating condition,
 /// so the final join loop only makes progress if a task panics; that panic is
 /// re-raised by `res.unwrap()`.
 async fn main_impl(args: Args) -> anyhow::Result<()> {
    // Leak the args to obtain a `'static` reference that can be given to spawned tasks.
    let args: &'static Args = Box::leak(Box::new(args));
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));
    // Optionally switch the pageserver's IO engine before generating any load.
    if let Some(engine_str) = &args.set_io_engine {
        mgmt_api_client.put_io_engine(engine_str).await?;
    }
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
    )
    .await?;
    let mut tasks = JoinSet::new();
    let live_stats = Arc::new(LiveStats::default());
    // Stats reporter: roughly once per second, log the per-second eviction and download
    // rates and the number of timeline restarts, resetting each counter as it is read.
    tasks.spawn({
        let live_stats = Arc::clone(&live_stats);
        async move {
            let mut last_at = Instant::now();
            loop {
                tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
                let now = Instant::now();
                // Divide by the measured interval rather than assuming exactly one second.
                let delta: Duration = now - last_at;
                last_at = now;
                let LiveStats {
                    evictions,
                    downloads,
                    timeline_restarts,
                } = &*live_stats;
                // `swap(0, ..)` reads and resets the counter in one step.
                let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
                let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
                let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
                info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
            }
        }
    });
    // One or more actors per discovered timeline.
    for tl in timelines {
        for _ in 0..args.tasks_per_target.get() {
            tasks.spawn(timeline_actor(
                args,
                Arc::clone(&mgmt_api_client),
                tl,
                Arc::clone(&live_stats),
            ));
        }
    }
    // Propagate a panic from any task as a panic of this task.
    while let Some(res) = tasks.join_next().await {
        res.unwrap();
    }
    Ok(())
 }
 /// Drives evictions and on-demand downloads for one timeline, forever.
 ///
 /// Each pass of the outer loop fetches the timeline's layer map and spawns one
 /// `layer_actor` per historic layer, each fed through its own capacity-1 channel.
 /// The inner loop then repeatedly acquires a concurrency permit and hands it to a
 /// randomly chosen layer actor. When any layer actor exits, all of them are shut
 /// down and the outer loop starts over with a fresh layer map.
 ///
 /// # Panics
 ///
 /// Panics if the layer map request fails, if the timeline has no historic layers,
 /// or if a layer actor's channel is found closed.
 async fn timeline_actor(
    args: &'static Args,
    mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
    timeline: TenantTimelineId,
    live_stats: Arc<LiveStats>,
 ) {
    // TODO: support sharding
    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
    // Per-restart state: the running layer actors, their channels, and the semaphore
    // bounding how many permits are handed out at once.
    struct Timeline {
        joinset: JoinSet<()>,
        layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
        concurrency: Arc<tokio::sync::Semaphore>,
    }
    loop {
        debug!("restarting timeline");
        let layer_map_info = mgmt_api_client
            .layer_map_info(tenant_shard_id, timeline.timeline_id)
            .await
            .unwrap();
        let concurrency = Arc::new(tokio::sync::Semaphore::new(
            args.concurrency_per_target.get(),
        ));
        let mut joinset = JoinSet::new();
        // Spawn one actor per historic layer and keep the sending half of its channel.
        let layers = layer_map_info
            .historic_layers
            .into_iter()
            .map(|historic_layer| {
                let (tx, rx) = mpsc::channel(1);
                joinset.spawn(layer_actor(
                    tenant_shard_id,
                    timeline.timeline_id,
                    historic_layer,
                    rx,
                    Arc::clone(&mgmt_api_client),
                    Arc::clone(&live_stats),
                ));
                tx
            })
            .collect::<Vec<_>>();
        // Shadows the `timeline` parameter until the end of this loop iteration.
        let mut timeline = Timeline {
            joinset,
            layers,
            concurrency,
        };
        live_stats.timeline_restart_done();
        loop {
            assert!(!timeline.joinset.is_empty());
            // A layer actor only returns when its request reported "not modified" or its
            // channel closed; either way, tear everything down and restart.
            if let Some(res) = timeline.joinset.try_join_next() {
                debug!(?res, "a layer actor exited, should not happen");
                timeline.joinset.shutdown().await;
                break;
            }
            // Wait for a free slot; the permit travels to the layer actor, which holds it
            // while it processes the request.
            let mut permit = Some(
                Arc::clone(&timeline.concurrency)
                    .acquire_owned()
                    .await
                    .unwrap(),
            );
            // Pick random layers until one accepts the permit. This retry loop has no
            // await point, so it spins while the chosen channels are full.
            loop {
                let layer_tx = {
                    let mut rng = rand::thread_rng();
                    timeline.layers.choose_mut(&mut rng).expect("no layers")
                };
                match layer_tx.try_send(permit.take().unwrap()) {
                    Ok(_) => break,
                    Err(e) => match e {
                        mpsc::error::TrySendError::Full(back) => {
                            // TODO: retrying introduces bias away from slow downloaders
                            permit.replace(back);
                        }
                        mpsc::error::TrySendError::Closed(_) => panic!(),
                    },
                }
            }
        }
    }
 }
 /// Toggles a single layer between resident and remote, one step per received permit.
 ///
 /// For every permit received on `rx`, the layer is downloaded on demand if the local
 /// `HistoricLayerInfo` says it is remote, and evicted otherwise. The permit is held
 /// until that request has finished. If the pageserver reports that nothing changed,
 /// the actor returns so that its owner can refresh the layer map. It also returns
 /// once the channel is closed.
 ///
 /// # Panics
 ///
 /// Panics if the management API request fails.
 async fn layer_actor(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    mut layer: HistoricLayerInfo,
    mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
    mgmt_api_client: Arc<mgmt_api::Client>,
    live_stats: Arc<LiveStats>,
 ) {
    // `_permit` stays alive for the whole iteration and is released when it ends.
    while let Some(_permit) = rx.recv().await {
        // Remote layers get downloaded, resident layers get evicted.
        let was_remote = layer.is_remote();
        let changed = if was_remote {
            let outcome = mgmt_api_client
                .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
                .await
                .unwrap();
            live_stats.download_done();
            outcome
        } else {
            let outcome = mgmt_api_client
                .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
                .await
                .unwrap();
            live_stats.eviction_done();
            outcome
        };
        if !changed {
            debug!("local copy of layer map appears out of sync, re-downloading");
            return;
        }
        debug!("did it");
        // The request succeeded, so the layer is now in the opposite state.
        layer.set_remote(!was_remote);
    }
 }
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -16,6 +16,7 @@ mod util {
 mod cmd {
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
    pub(super) mod ondemand_download_churn;
    pub(super) mod trigger_initial_size_calculation;
 }
@@ -25,6 +26,7 @@ enum Args {
    Basebackup(cmd::basebackup::Args),
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
 }
 fn main() {
@@ -43,6 +45,7 @@ fn main() {
        Args::TriggerInitialSizeCalculation(args) => {
            cmd::trigger_initial_size_calculation::main(args)
        }
        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
    }
    .unwrap()
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,3 +1,5 @@
 #![recursion_limit = "300"]
 //! Main entry point for the Page Server executable.
 use std::env::{var, VarError};
@@ -118,6 +120,9 @@ fn main() -> anyhow::Result<()> {
        &[("node_id", &conf.id.to_string())],
    );
    // after setting up logging, log the effective IO engine choice
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
        utils::crashsafe::create_dir_all(conf.tenants_path())
@@ -312,6 +317,7 @@ fn start_pageserver(
    let http_listener = tcp_listener::bind(http_addr)?;
    let pg_addr = &conf.listen_pg_addr;
    info!("Starting pageserver pg protocol handler on {pg_addr}");
    let pageserver_listener = tcp_listener::bind(pg_addr)?;
@@ -544,7 +550,7 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
-                tenant_manager,
+                tenant_manager.clone(),
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
@@ -594,32 +600,37 @@ fn start_pageserver(
            None,
            "consumption metrics collection",
            true,
-            async move {
+            {
-                // first wait until background jobs are cleared to launch.
+                let tenant_manager = tenant_manager.clone();
-                //
+                async move {
-                // this is because we only process active tenants and timelines, and the
+                    // first wait until background jobs are cleared to launch.
-                // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    //
-                // which will not be rate-limited.
+                    // this is because we only process active tenants and timelines, and the
-                let cancel = task_mgr::shutdown_token();
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
                    // which will not be rate-limited.
                    let cancel = task_mgr::shutdown_token();
-                tokio::select! {
+                    tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); },
+                        _ = cancel.cancelled() => { return Ok(()); },
-                    _ = background_jobs_barrier.wait() => {}
+                        _ = background_jobs_barrier.wait() => {}
-                };
+                    };
-                pageserver::consumption_metrics::collect_metrics(
+                    pageserver::consumption_metrics::collect_metrics(
-                    metric_collection_endpoint,
+                        tenant_manager,
-                    conf.metric_collection_interval,
+                        metric_collection_endpoint,
-                    conf.cached_metric_collection_interval,
+                        &conf.metric_collection_bucket,
-                    conf.synthetic_size_calculation_interval,
+                        conf.metric_collection_interval,
-                    conf.id,
+                        conf.cached_metric_collection_interval,
-                    local_disk_storage,
+                        conf.synthetic_size_calculation_interval,
-                    cancel,
+                        conf.id,
-                    metrics_ctx,
+                        local_disk_storage,
-                )
+                        cancel,
-                .instrument(info_span!("metrics_collection"))
+                        metrics_ctx,
-                .await?;
+                    )
-                Ok(())
+                    .instrument(info_span!("metrics_collection"))
                    .await?;
                    Ok(())
                }
            },
        );
    }
@@ -688,6 +699,7 @@ fn start_pageserver(
                let bg_remote_storage = remote_storage.clone();
                let bg_deletion_queue = deletion_queue.clone();
                BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
                    &tenant_manager,
                    bg_remote_storage.map(|_| bg_deletion_queue),
                    0,
                ));
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -7,8 +7,9 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde;
 use serde::de::IntoDeserializer;
-use std::env;
+use std::{collections::HashMap, env};
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
@@ -29,18 +30,17 @@ use utils::{
    logging::LogFormat,
 };
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
-use crate::virtual_file;
+use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{
    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
 };
 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -83,6 +83,10 @@ pub mod defaults {
    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
    #[cfg(target_os = "linux")]
    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
    #[cfg(not(target_os = "linux"))]
    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
@@ -91,6 +95,8 @@ pub mod defaults {
    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
    ///
    /// Default built-in configuration file.
    ///
@@ -152,6 +158,8 @@ pub mod defaults {
 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
 #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
 #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
 [remote_storage]
 "#
@@ -230,6 +238,7 @@ pub struct PageServerConf {
    // How often to send unchanged cached metrics to the metrics endpoint.
    pub cached_metric_collection_interval: Duration,
    pub metric_collection_endpoint: Option<Url>,
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    pub synthetic_size_calculation_interval: Duration,
    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
@@ -274,6 +283,13 @@ pub struct PageServerConf {
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
    pub validate_vectored_get: bool,
    /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM.  When this
    /// is exceeded, we start proactively closing ephemeral layers to limit the total amount
    /// of ephemeral data.
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
 }
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -286,21 +302,49 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
 // Use a dedicated enum for builder fields to better indicate the intention
 // and to avoid possible confusion with nested options (several config values
 // are themselves `Option<_>`).
 #[derive(Clone, Default)]
 pub enum BuilderValue<T> {
    // The field was given an explicit value.
    Set(T),
    // No value was provided for the field; this is the `Default` variant.
    #[default]
    NotSet,
 }
-impl<T> BuilderValue<T> {
+impl<T: Clone> BuilderValue<T> {
-    pub fn ok_or<E>(self, err: E) -> Result<T, E> {
+    pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
        match self {
-            Self::Set(v) => Ok(v),
+            Self::Set(v) => Ok(v.clone()),
-            Self::NotSet => Err(err),
+            Self::NotSet => match default {
                BuilderValue::Set(v) => Ok(v.clone()),
                BuilderValue::NotSet => {
                    anyhow::bail!("missing config value {field_name:?}")
                }
            },
        }
    }
 }
 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
 // as a separate structure.  This information is not needed by the pageserver
 // itself; it is only used for registering the pageserver with the control
 // plane and/or storage controller.
 //
 #[derive(serde::Deserialize)]
 pub(crate) struct NodeMetadata {
    // Deserialized from the `host` key of the metadata document.
    #[serde(rename = "host")]
    pub(crate) postgres_host: String,
    // Deserialized from the `port` key of the metadata document.
    #[serde(rename = "port")]
    pub(crate) postgres_port: u16,
    pub(crate) http_host: String,
    pub(crate) http_port: u16,
    // Deployment tools may write fields to the metadata file beyond what we
    // use in this type: this type intentionally only names the fields that we
    // require. Everything else is collected into this map via `flatten`.
    #[serde(flatten)]
    pub(crate) other: HashMap<String, serde_json::Value>,
 }
 // needed to simplify config construction
 #[derive(Default)]
 struct PageServerConfigBuilder {
    listen_pg_addr: BuilderValue<String>,
@@ -341,6 +385,7 @@ struct PageServerConfigBuilder {
    cached_metric_collection_interval: BuilderValue<Duration>,
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,
    metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
@@ -366,10 +411,13 @@ struct PageServerConfigBuilder {
    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
    validate_vectored_get: BuilderValue<bool>,
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
 }
-impl Default for PageServerConfigBuilder {
+impl PageServerConfigBuilder {
-    fn default() -> Self {
+    #[inline(always)]
    fn default_values() -> Self {
        use self::BuilderValue::*;
        use defaults::*;
        Self {
@@ -422,6 +470,8 @@ impl Default for PageServerConfigBuilder {
            .expect("cannot parse default synthetic size calculation interval")),
            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
            metric_collection_bucket: Set(None),
            disk_usage_based_eviction: Set(None),
            test_remote_failures: Set(0),
@@ -449,6 +499,7 @@ impl Default for PageServerConfigBuilder {
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
        }
    }
 }
@@ -553,6 +604,13 @@ impl PageServerConfigBuilder {
        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
    }
    /// Records an explicit `metric_collection_bucket` setting on the builder.
    ///
    /// The value is wrapped in `BuilderValue::Set` as-is, so passing `None`
    /// is stored as an explicitly set `None` rather than as "not set".
    pub fn metric_collection_bucket(
        &mut self,
        metric_collection_bucket: Option<RemoteStorageConfig>,
    ) {
        let explicit = BuilderValue::Set(metric_collection_bucket);
        self.metric_collection_bucket = explicit;
    }
    pub fn synthetic_size_calculation_interval(
        &mut self,
        synthetic_size_calculation_interval: Duration,
@@ -621,126 +679,103 @@ impl PageServerConfigBuilder {
        self.validate_vectored_get = BuilderValue::Set(value);
    }
    /// Records an explicit `ephemeral_bytes_per_memory_kb` setting on the builder.
    ///
    /// Despite the `get_` prefix this method is a setter: it stores `value`
    /// as `BuilderValue::Set(value)` and returns nothing.
    pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
        let explicit = BuilderValue::Set(value);
        self.ephemeral_bytes_per_memory_kb = explicit;
    }
    pub fn build(self) -> anyhow::Result<PageServerConf> {
-        let concurrent_tenant_warmup = self
+        let default = Self::default_values();
-            .concurrent_tenant_warmup
+
-            .ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
+        macro_rules! conf {
-        let concurrent_tenant_size_logical_size_queries = self
+            (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
-            .concurrent_tenant_size_logical_size_queries
+                PageServerConf {
-            .ok_or(anyhow!(
+                    $(
-                "missing concurrent_tenant_size_logical_size_queries"
+                        $field: self.$field.ok_or(stringify!($field), default.$field)?,
-            ))?;
+                    )*
-        Ok(PageServerConf {
+                    $(
-            listen_pg_addr: self
+                        $custom_field: $custom_value,
-                .listen_pg_addr
+                    )*
-                .ok_or(anyhow!("missing listen_pg_addr"))?,
+                }
-            listen_http_addr: self
+            };
-                .listen_http_addr
+        }
-                .ok_or(anyhow!("missing listen_http_addr"))?,
+
-            availability_zone: self
+        Ok(conf!(
-                .availability_zone
+            USING DEFAULT
-                .ok_or(anyhow!("missing availability_zone"))?,
+            {
-            wait_lsn_timeout: self
+                listen_pg_addr,
-                .wait_lsn_timeout
+                listen_http_addr,
-                .ok_or(anyhow!("missing wait_lsn_timeout"))?,
+                availability_zone,
-            wal_redo_timeout: self
+                wait_lsn_timeout,
-                .wal_redo_timeout
+                wal_redo_timeout,
-                .ok_or(anyhow!("missing wal_redo_timeout"))?,
+                superuser,
-            superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
+                page_cache_size,
-            page_cache_size: self
+                max_file_descriptors,
-                .page_cache_size
+                workdir,
-                .ok_or(anyhow!("missing page_cache_size"))?,
+                pg_distrib_dir,
-            max_file_descriptors: self
+                http_auth_type,
-                .max_file_descriptors
+                pg_auth_type,
-                .ok_or(anyhow!("missing max_file_descriptors"))?,
+                auth_validation_public_key_path,
-            workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
+                remote_storage_config,
-            pg_distrib_dir: self
+                id,
-                .pg_distrib_dir
+                broker_endpoint,
-                .ok_or(anyhow!("missing pg_distrib_dir"))?,
+                broker_keepalive_interval,
-            http_auth_type: self
+                log_format,
-                .http_auth_type
+                metric_collection_interval,
-                .ok_or(anyhow!("missing http_auth_type"))?,
+                cached_metric_collection_interval,
-            pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
+                metric_collection_endpoint,
-            auth_validation_public_key_path: self
+                metric_collection_bucket,
-                .auth_validation_public_key_path
+                synthetic_size_calculation_interval,
-                .ok_or(anyhow!("missing auth_validation_public_key_path"))?,
+                disk_usage_based_eviction,
-            remote_storage_config: self
+                test_remote_failures,
-                .remote_storage_config
+                ondemand_download_behavior_treat_error_as_warn,
-                .ok_or(anyhow!("missing remote_storage_config"))?,
+                background_task_maximum_delay,
-            id: self.id.ok_or(anyhow!("missing id"))?,
+                control_plane_api,
-            // TenantConf is handled separately
+                control_plane_api_token,
-            default_tenant_conf: TenantConf::default(),
+                control_plane_emergency_mode,
-            broker_endpoint: self
+                heatmap_upload_concurrency,
-                .broker_endpoint
+                secondary_download_concurrency,
-                .ok_or(anyhow!("No broker endpoints provided"))?,
+                ingest_batch_size,
-            broker_keepalive_interval: self
+                get_vectored_impl,
-                .broker_keepalive_interval
+                max_vectored_read_bytes,
-                .ok_or(anyhow!("No broker keepalive interval provided"))?,
+                validate_vectored_get,
-            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
+                ephemeral_bytes_per_memory_kb,
-            concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
+            }
-            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
+            CUSTOM LOGIC
-                concurrent_tenant_size_logical_size_queries,
+            {
-            ),
+                // TenantConf is handled separately
-            eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
+                default_tenant_conf: TenantConf::default(),
-                concurrent_tenant_size_logical_size_queries,
+                concurrent_tenant_warmup: ConfigurableSemaphore::new({
-            ),
+                    self
-            metric_collection_interval: self
+                        .concurrent_tenant_warmup
-                .metric_collection_interval
+                        .ok_or("concurrent_tenant_warmpup",
-                .ok_or(anyhow!("missing metric_collection_interval"))?,
+                               default.concurrent_tenant_warmup)?
-            cached_metric_collection_interval: self
+                }),
-                .cached_metric_collection_interval
+                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
-                .ok_or(anyhow!("missing cached_metric_collection_interval"))?,
+                    self
-            metric_collection_endpoint: self
+                        .concurrent_tenant_size_logical_size_queries
-                .metric_collection_endpoint
+                        .ok_or("concurrent_tenant_size_logical_size_queries",
-                .ok_or(anyhow!("missing metric_collection_endpoint"))?,
+                               default.concurrent_tenant_size_logical_size_queries.clone())?
-            synthetic_size_calculation_interval: self
+                ),
-                .synthetic_size_calculation_interval
+                eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
-                .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
+                    // re-use `concurrent_tenant_size_logical_size_queries`
-            disk_usage_based_eviction: self
+                    self
-                .disk_usage_based_eviction
+                        .concurrent_tenant_size_logical_size_queries
-                .ok_or(anyhow!("missing disk_usage_based_eviction"))?,
+                        .ok_or("eviction_task_immitated_concurrent_logical_size_queries",
-            test_remote_failures: self
+                               default.concurrent_tenant_size_logical_size_queries.clone())?,
-                .test_remote_failures
+                ),
-                .ok_or(anyhow!("missing test_remote_failuers"))?,
+                virtual_file_io_engine: match self.virtual_file_io_engine {
-            ondemand_download_behavior_treat_error_as_warn: self
+                    BuilderValue::Set(v) => v,
-                .ondemand_download_behavior_treat_error_as_warn
+                    BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
-                .ok_or(anyhow!(
+                        io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
-                    "missing ondemand_download_behavior_treat_error_as_warn"
+                        io_engine::FeatureTestResult::Worse { engine, remark } => {
-                ))?,
+                            // TODO: bubble this up to the caller so we can tracing::warn! it.
-            background_task_maximum_delay: self
+                            eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
-                .background_task_maximum_delay
+                            engine
-                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
+                        }
-            control_plane_api: self
+                    },
-                .control_plane_api
+                },
-                .ok_or(anyhow!("missing control_plane_api"))?,
+            }
-            control_plane_api_token: self
+        ))
                .control_plane_api_token
                .ok_or(anyhow!("missing control_plane_api_token"))?,
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
            heatmap_upload_concurrency: self
                .heatmap_upload_concurrency
                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
            secondary_download_concurrency: self
                .secondary_download_concurrency
                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
            ingest_batch_size: self
                .ingest_batch_size
                .ok_or(anyhow!("missing ingest_batch_size"))?,
            virtual_file_io_engine: self
                .virtual_file_io_engine
                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
            get_vectored_impl: self
                .get_vectored_impl
                .ok_or(anyhow!("missing get_vectored_impl"))?,
            max_vectored_read_bytes: self
                .max_vectored_read_bytes
                .ok_or(anyhow!("missing max_vectored_read_bytes"))?,
            validate_vectored_get: self
                .validate_vectored_get
                .ok_or(anyhow!("missing validate_vectored_get"))?,
        })
    }
 }
@@ -757,6 +792,10 @@ impl PageServerConf {
        self.workdir.join("deletion")
    }
    /// Returns the path of the `metadata.json` file located directly inside
    /// the configured `workdir`.
    pub fn metadata_path(&self) -> Utf8PathBuf {
        let file_name = "metadata.json";
        self.workdir.join(file_name)
    }
    pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
@@ -816,18 +855,7 @@ impl PageServerConf {
            .join(timeline_id.to_string())
    }
-    pub fn timeline_uninit_mark_file_path(
+    pub(crate) fn timeline_delete_mark_file_path(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Utf8PathBuf {
        path_with_suffix_extension(
            self.timeline_path(&tenant_shard_id, &timeline_id),
            TIMELINE_UNINIT_MARK_SUFFIX,
        )
    }
    pub fn timeline_delete_mark_file_path(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
@@ -838,7 +866,10 @@ impl PageServerConf {
        )
    }
-    pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+    pub(crate) fn tenant_deleted_mark_file_path(
        &self,
        tenant_shard_id: &TenantShardId,
    ) -> Utf8PathBuf {
        self.tenant_path(tenant_shard_id)
            .join(TENANT_DELETED_MARKER_FILE_NAME)
    }
@@ -942,6 +973,9 @@ impl PageServerConf {
                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
                    builder.metric_collection_endpoint(Some(endpoint));
                },
                "metric_collection_bucket" => {
                    builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
                }
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
@@ -995,6 +1029,9 @@ impl PageServerConf {
                "validate_vectored_get" => {
                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
                }
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1057,6 +1094,7 @@ impl PageServerConf {
            metric_collection_interval: Duration::from_secs(60),
            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
            metric_collection_bucket: None,
            synthetic_size_calculation_interval: Duration::from_secs(60),
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
@@ -1075,6 +1113,7 @@ impl PageServerConf {
                    .expect("Invalid default constant"),
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
        }
    }
 }
@@ -1289,6 +1328,7 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
                )?,
                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: humantime::parse_duration(
                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                )?,
@@ -1311,6 +1351,7 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1363,6 +1404,7 @@ background_task_maximum_delay = '334 s'
                metric_collection_interval: Duration::from_secs(222),
                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                metric_collection_bucket: None,
                synthetic_size_calculation_interval: Duration::from_secs(333),
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
@@ -1381,6 +1423,7 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,10 +3,13 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
+use crate::tenant::{
    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
 };
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
 use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
 use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -40,7 +43,9 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
 /// Main thread that serves metrics collection
 #[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
    tenant_manager: Arc<TenantManager>,
    metric_collection_endpoint: &Url,
    metric_collection_bucket: &Option<RemoteStorageConfig>,
    metric_collection_interval: Duration,
    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
@@ -65,15 +70,19 @@ pub async fn collect_metrics(
        None,
        "synthetic size calculation",
        false,
-        async move {
+        {
-            calculate_synthetic_size_worker(
+            let tenant_manager = tenant_manager.clone();
-                synthetic_size_calculation_interval,
+            async move {
-                &cancel,
+                calculate_synthetic_size_worker(
-                &worker_ctx,
+                    tenant_manager,
-            )
+                    synthetic_size_calculation_interval,
-            .instrument(info_span!("synthetic_size_worker"))
+                    &cancel,
-            .await?;
+                    &worker_ctx,
-            Ok(())
+                )
                .instrument(info_span!("synthetic_size_worker"))
                .await?;
                Ok(())
            }
        },
    );
@@ -94,13 +103,27 @@ pub async fn collect_metrics(
        .build()
        .expect("Failed to create http client with timeout");
    let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
        match GenericRemoteStorage::from_config(bucket_config) {
            Ok(client) => Some(client),
            Err(e) => {
                // Non-fatal error: if we were given an invalid config, we will proceed
                // with sending metrics over the network, but not to S3.
                tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
                None
            }
        }
    } else {
        None
    };
    let node_id = node_id.to_string();
    loop {
        let started_at = Instant::now();
        // these are point in time, with variable "now"
-        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
+        let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
        let metrics = Arc::new(metrics);
@@ -118,10 +141,18 @@ pub async fn collect_metrics(
                    tracing::error!("failed to persist metrics to {path:?}: {e:#}");
                }
            }
            if let Some(bucket_client) = &bucket_client {
                let res =
                    upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
                if let Err(e) = res {
                    tracing::error!("failed to upload to S3: {e:#}");
                }
            }
        };
        let upload = async {
-            let res = upload::upload_metrics(
+            let res = upload::upload_metrics_http(
                &client,
                metric_collection_endpoint,
                &cancel,
@@ -132,7 +163,7 @@ pub async fn collect_metrics(
            .await;
            if let Err(e) = res {
                // serialization error which should never happen
-                tracing::error!("failed to upload due to {e:#}");
+                tracing::error!("failed to upload via HTTP due to {e:#}");
            }
        };
@@ -247,6 +278,7 @@ async fn reschedule(
 /// Calculate synthetic size for each active tenant
 async fn calculate_synthetic_size_worker(
    tenant_manager: Arc<TenantManager>,
    synthetic_size_calculation_interval: Duration,
    cancel: &CancellationToken,
    ctx: &RequestContext,
@@ -259,7 +291,7 @@ async fn calculate_synthetic_size_worker(
    loop {
        let started_at = Instant::now();
-        let tenants = match mgr::list_tenants().await {
+        let tenants = match tenant_manager.list_tenants() {
            Ok(tenants) => tenants,
            Err(e) => {
                warn!("cannot get tenant list: {e:#}");
@@ -278,10 +310,14 @@ async fn calculate_synthetic_size_worker(
                continue;
            }
-            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
+            let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
                continue;
            };
            if !tenant.is_active() {
                continue;
            }
            // there is never any reason to exit calculate_synthetic_size_worker following any
            // return value -- we don't need to care about shutdown because no tenant is found when
            // pageserver is shut down.
@@ -319,9 +355,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    };
    // this error can be returned if timeline is shutting down, but it does not
-    // mean the synthetic size worker should terminate. we do not need any checks
+    // mean the synthetic size worker should terminate.
    // in this function because `mgr::get_tenant` will error out after shutdown has
    // progressed to shutting down tenants.
    let shutting_down = matches!(
        e.downcast_ref::<PageReconstructError>(),
        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,3 +1,4 @@
 use crate::tenant::mgr::TenantManager;
 use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
@@ -181,6 +182,7 @@ impl MetricsKey {
 }
 pub(super) async fn collect_all_metrics(
    tenant_manager: &Arc<TenantManager>,
    cached_metrics: &Cache,
    ctx: &RequestContext,
 ) -> Vec<RawMetric> {
@@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics(
    let started_at = std::time::Instant::now();
-    let tenants = match crate::tenant::mgr::list_tenants().await {
+    let tenants = match tenant_manager.list_tenants() {
        Ok(tenants) => tenants,
        Err(err) => {
            tracing::error!("failed to list tenants: {:?}", err);
@@ -200,7 +202,8 @@ pub(super) async fn collect_all_metrics(
        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
-            crate::tenant::mgr::get_tenant(id, true)
+            tenant_manager
                .get_attached_tenant_shard(id)
                .ok()
                .map(|tenant| (id.tenant_id, tenant))
        }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,4 +1,9 @@
 use std::time::SystemTime;
 use chrono::{DateTime, Utc};
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -13,8 +18,9 @@ struct Ids {
    pub(super) timeline_id: Option<TimelineId>,
 }
 /// Serialize and write metrics to an HTTP endpoint
 #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
-pub(super) async fn upload_metrics(
+pub(super) async fn upload_metrics_http(
    client: &reqwest::Client,
    metric_collection_endpoint: &reqwest::Url,
    cancel: &CancellationToken,
@@ -74,6 +80,60 @@ pub(super) async fn upload_metrics(
    Ok(())
 }
 /// Serialize and write metrics to a remote storage object
 ///
 /// The metrics are serialized in chunks of `CHUNK_SIZE`, gzip-compressed into an in-memory
 /// buffer, and uploaded as a single object whose path is built from the current UTC time and
 /// `node_id`: `year=%Y/month=%m/day=%d/%H:%M:%SZ_{node_id}.ndjson.gz`.
 ///
 /// If `metrics` is empty, nothing is uploaded and `Ok(())` is returned.
 ///
 /// # Errors
 ///
 /// Propagates any error from building the remote path, serializing a chunk, writing to or
 /// finishing the gzip encoder, or the upload itself.
 #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
 pub(super) async fn upload_metrics_bucket(
    client: &GenericRemoteStorage,
    cancel: &CancellationToken,
    node_id: &str,
    metrics: &[RawMetric],
 ) -> anyhow::Result<()> {
    if metrics.is_empty() {
        // Skip uploads if we have no metrics, so that readers don't have to handle the edge case
        // of an empty object.
        return Ok(());
    }
    // Compose object path: a date-partitioned timestamp prefix followed by the node id
    let datetime: DateTime<Utc> = SystemTime::now().into();
    let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
    let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
    // Set up a gzip writer into a buffer
    let mut compressed_bytes: Vec<u8> = Vec::new();
    let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
    let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
    // Serialize and write into compressed buffer
    // The timer started here also covers the upload below, not just serialization.
    let started_at = std::time::Instant::now();
    for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
        // Only the serialized body is written; the chunk value itself is not needed here.
        let (_chunk, body) = res?;
        gzip_writer.write_all(&body).await?;
    }
    // Flush and shut down the encoder before measuring the buffer, so that the length below
    // is taken only after the encoder has finished writing.
    gzip_writer.flush().await?;
    gzip_writer.shutdown().await?;
    // Record the length now: `compressed_bytes` is moved into the upload stream below.
    let compressed_length = compressed_bytes.len();
    // Write to remote storage
    // The whole compressed buffer is sent as a single-item stream.
    client
        .upload_storage_object(
            futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
            compressed_length,
            &path,
            cancel,
        )
        .await?;
    let elapsed = started_at.elapsed();
    tracing::info!(
        compressed_length,
        elapsed_ms = elapsed.as_millis(),
        "write metrics bucket at {path}",
    );
    Ok(())
 }
 // The return type is quite ugly, but we gain testability in isolation
 fn serialize_in_chunks<'a, F>(
    chunk_size: usize,
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -2,9 +2,11 @@ use std::collections::HashMap;
 use futures::Future;
 use pageserver_api::{
    controller_api::NodeRegisterRequest,
    shard::TenantShardId,
    upcall_api::{
-        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
        ValidateRequestTenant, ValidateResponse,
    },
 };
 use serde::{de::DeserializeOwned, Serialize};
@@ -12,7 +14,10 @@ use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{backoff, generation::Generation, id::NodeId};
-use crate::config::PageServerConf;
+use crate::{
    config::{NodeMetadata, PageServerConf},
    virtual_file::on_fatal_io_error,
 };
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
@@ -32,7 +37,10 @@ pub enum RetryForeverError {
 pub trait ControlPlaneGenerationsApi {
    fn re_attach(
        &self,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
+        conf: &PageServerConf,
    ) -> impl Future<
        Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
    > + Send;
    fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
@@ -110,13 +118,59 @@ impl ControlPlaneClient {
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
-    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+    async fn re_attach(
        &self,
        conf: &PageServerConf,
    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
            .expect("Failed to build re-attach path");
        // Include registration content in the re-attach request if a metadata file is readable
        let metadata_path = conf.metadata_path();
        let register = match tokio::fs::read_to_string(&metadata_path).await {
            Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
                Ok(m) => {
                    // Since we run one time at startup, be generous in our logging and
                    // dump all metadata.
                    tracing::info!(
                        "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
                        m.postgres_host,
                        m.postgres_port,
                        m.http_host,
                        m.http_port,
                        m.other
                    );
                    Some(NodeRegisterRequest {
                        node_id: conf.id,
                        listen_pg_addr: m.postgres_host,
                        listen_pg_port: m.postgres_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
                    })
                }
                Err(e) => {
                    tracing::error!("Unreadable metadata in {metadata_path}: {e}");
                    None
                }
            },
            Err(e) => {
                if e.kind() == std::io::ErrorKind::NotFound {
                    // This is legal: we may have been deployed with some external script
                    // doing registration for us.
                    tracing::info!("Metadata file not found at {metadata_path}");
                } else {
                    on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}"))
                }
                None
            }
        };
        let request = ReAttachRequest {
            node_id: self.node_id,
            register,
        };
        fail::fail_point!("control-plane-client-re-attach");
@@ -130,7 +184,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|t| (t.id, Generation::new(t.gen)))
+            .map(|rart| (rart.id, rart))
            .collect::<HashMap<_, _>>())
    }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -724,8 +724,8 @@ impl DeletionQueue {
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
-    use pageserver_api::shard::ShardIndex;
+    use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
-    use std::io::ErrorKind;
+    use std::{io::ErrorKind, time::Duration};
    use tracing::info;
    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -831,9 +831,13 @@ mod test {
    }
    impl ControlPlaneGenerationsApi for MockControlPlane {
-        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
+        async fn re_attach(
            &self,
            _conf: &PageServerConf,
        ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
            unimplemented!()
        }
        async fn validate(
            &self,
            tenants: Vec<(TenantShardId, Generation)>,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -61,7 +61,6 @@ use crate::{
    metrics::disk_usage_based_eviction::METRICS,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        self,
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
@@ -814,8 +813,8 @@ async fn collect_eviction_candidates(
    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
    // get a snapshot of the list of tenants
-    let tenants = tenant::mgr::list_tenants()
+    let tenants = tenant_manager
-        .await
+        .list_tenants()
        .context("get list of tenants")?;
    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
@@ -827,8 +826,12 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
+        let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
-            Ok(tenant) => tenant,
+            Ok(tenant) if tenant.is_active() => tenant,
            Ok(_) => {
                debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
                continue;
            }
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
                debug!("failed to get tenant: {e:#}");
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -567,9 +567,9 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
-  /v1/tenant/{tenant_id}/location_config:
+  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
-      - name: tenant_id
+      - name: tenant_shard_id
        in: path
        required: true
        schema:
@@ -932,6 +932,75 @@ paths:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
  /v1/tenant/{tenant_shard_id}/heatmap_upload:
    parameters:
      - name: tenant_shard_id
        in: path
        required: true
        schema:
          type: string
    post:
      description: |
        If the location is in an attached mode, upload the current state to the remote heatmap
      responses:
        "200":
          description: Success
        "500":
          description: Generic operation error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "503":
          description: Temporarily unavailable, please retry.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
  /v1/tenant/{tenant_shard_id}/secondary/download:
    parameters:
      - name: tenant_shard_id
        in: path
        required: true
        schema:
          type: string
      - name: wait_ms
        description: If set, we will wait this long for download to complete, and if it isn't complete then return 202
        in: query
        required: false
        schema:
          type: integer
    post:
      description: |
        If the location is in secondary mode, download latest heatmap and layers
      responses:
        "200":
          description: Success
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/SecondaryProgress"
        "202":
          description: Download has started but not yet finished
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/SecondaryProgress"
        "500":
          description: Generic operation error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "503":
          description: Temporarily unavailable, please retry.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -969,7 +1038,7 @@ paths:
                  format: hex
      responses:
        "201":
-          description: TimelineInfo
+          description: Timeline was created, or already existed with matching parameters
          content:
            application/json:
              schema:
@@ -999,11 +1068,17 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
        "409":
-          description: Timeline already exists, creation skipped
+          description: Timeline already exists, with different parameters.  Creation cannot proceed.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
        "429":
          description: A creation request was sent for the same Timeline Id while a creation was already in progress.  Back off and retry.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "500":
          description: Generic operation error
          content:
@@ -1314,10 +1389,11 @@ components:
    TenantLocationConfigRequest:
      type: object
      required:
-        - tenant_id
+        - mode
      properties:
        tenant_id:
          type: string
          description: Not used, scheduled for removal.
        mode:
          type: string
          enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -1391,7 +1467,7 @@ components:
        trace_read_requests:
          type: boolean
        heatmap_period:
-          type: integer
+          type: string
    TenantConfigResponse:
      type: object
      properties:
@@ -1569,6 +1645,37 @@ components:
            Lower is better score for how good this pageserver would be for the next tenant.
            The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
    SecondaryProgress:
      type: object
      required:
        - heatmap_mtime
        - layers_downloaded
        - layers_total
        - bytes_downloaded
        - bytes_total
      properties:
        heatmap_mtime:
          type: string
          format: date-time
          description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
        layers_downloaded:
          type: integer
          format: int64
          description: How many layers from the latest layer heatmap are present on disk
        bytes_downloaded:
          type: integer
          format: int64
          description: How many bytes of layer content from the latest layer heatmap are present on disk
        layers_total:
          type: integer
          format: int64
          description: How many layers were in the latest layer heatmap
        bytes_total:
          type: integer
          format: int64
          description: How many bytes of layer content were in the latest layer heatmap
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -48,8 +49,8 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
-    TenantSlotError, TenantSlotUpsertError, TenantStateError,
+    TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::remote_timeline_client;
@@ -248,16 +249,11 @@ impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            GetTenantError::Broken(reason) => {
                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
            }
            GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
                // in fact exist locally. If we did, the caller could draw the conclusion
                // that it can attach the tenant to another PS and we'd be in split-brain.
                //
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
@@ -268,6 +264,9 @@ impl From<GetTenantError> for ApiError {
 impl From<GetActiveTenantError> for ApiError {
    fn from(e: GetActiveTenantError) -> ApiError {
        match e {
            GetActiveTenantError::Broken(reason) => {
                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
            }
            GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
            GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
            GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -278,19 +277,6 @@ impl From<GetActiveTenantError> for ApiError {
    }
 }
 /// Converts a [`SetNewTenantConfigError`] into the [`ApiError`] returned to HTTP callers.
 impl From<SetNewTenantConfigError> for ApiError {
    /// Maps `GetTenant` to `ApiError::NotFound` (message `tenant {tid}`); `Persist` and `Other`
    /// are wrapped unchanged in `ApiError::InternalServerError`.
    fn from(e: SetNewTenantConfigError) -> ApiError {
        match e {
            SetNewTenantConfigError::GetTenant(tid) => {
                ApiError::NotFound(anyhow!("tenant {}", tid).into())
            }
            // Bind the whole error as `e` so it can be wrapped as the source of the anyhow error.
            e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
                ApiError::InternalServerError(anyhow::Error::new(e))
            }
        }
    }
 }
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
@@ -494,7 +480,7 @@ async fn timeline_create_handler(
    async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id, false)?;
+            .get_attached_tenant_shard(tenant_shard_id)?;
        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -534,10 +520,13 @@ async fn timeline_create_handler(
                    HttpErrorBody::from_msg("Tenant shutting down".to_string()),
                )
            }
-            Err(
+            Err(e @ tenant::CreateTimelineError::Conflict) => {
-                tenant::CreateTimelineError::Conflict
+                json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
-                | tenant::CreateTimelineError::AlreadyCreating,
+            }
-            ) => json_response(StatusCode::CONFLICT, ()),
+            Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
                StatusCode::TOO_MANY_REQUESTS,
                HttpErrorBody::from_msg(e.to_string()),
            ),
            Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
                StatusCode::NOT_ACCEPTABLE,
                HttpErrorBody::from_msg(format!("{err:#}")),
@@ -580,7 +569,7 @@ async fn timeline_list_handler(
    let response_data = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id, false)?;
+            .get_attached_tenant_shard(tenant_shard_id)?;
        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -618,6 +607,7 @@ async fn timeline_preserve_initdb_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let state = get_state(&request);
    // Part of the process for disaster recovery from safekeeper-stored WAL:
    // If we don't recover into a new timeline but want to keep the timeline ID,
@@ -625,7 +615,9 @@ async fn timeline_preserve_initdb_handler(
    // location where timeline recreation can find it.
    async {
-        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -667,7 +659,7 @@ async fn timeline_detail_handler(
    let timeline_info = async {
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id, false)?;
+            .get_attached_tenant_shard(tenant_shard_id)?;
        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -854,7 +846,7 @@ async fn timeline_delete_handler(
    let tenant = state
        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id, false)
+        .get_attached_tenant_shard(tenant_shard_id)
        .map_err(|e| {
            match e {
                // GetTenantError has a built-in conversion to ApiError, but in this context we don't
@@ -885,14 +877,16 @@ async fn tenant_detach_handler(
    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(
+    state
-        conf,
+        .tenant_manager
-        tenant_shard_id,
+        .detach_tenant(
-        detach_ignored.unwrap_or(false),
+            conf,
-        &state.deletion_queue_client,
+            tenant_shard_id,
-    )
+            detach_ignored.unwrap_or(false),
-    .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+            &state.deletion_queue_client,
-    .await?;
+        )
        .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
        .await?;
    json_response(StatusCode::OK, ())
 }
@@ -970,10 +964,11 @@ async fn tenant_list_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
    let state = get_state(&request);
-    let response_data = mgr::list_tenants()
+    let response_data = state
-        .instrument(info_span!("tenant_list"))
+        .tenant_manager
-        .await
+        .list_tenants()
        .map_err(|_| {
            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
        })?
@@ -996,9 +991,12 @@ async fn tenant_status(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let state = get_state(&request);
    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -1071,9 +1069,7 @@ async fn tenant_size_handler(
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
    let headers = request.headers();
-
+    let state = get_state(&request);
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let tenant = mgr::get_tenant(tenant_shard_id, true)?;
    if !tenant_shard_id.is_zero() {
        return Err(ApiError::BadRequest(anyhow!(
@@ -1081,6 +1077,12 @@ async fn tenant_size_handler(
        )));
    }
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let tenant = state
        .tenant_manager
        .get_attached_tenant_shard(tenant_shard_id)?;
    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
    // this can be long operation
    let inputs = tenant
        .gather_size_inputs(
@@ -1149,9 +1151,19 @@ async fn tenant_shard_split_handler(
    let state = get_state(&request);
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
    let tenant = state
        .tenant_manager
        .get_attached_tenant_shard(tenant_shard_id)?;
    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
    let new_shards = state
        .tenant_manager
-        .shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
+        .shard_split(
            tenant,
            ShardCount::new(req.new_shard_count),
            req.new_stripe_size,
            &ctx,
        )
        .await
        .map_err(ApiError::InternalServerError)?;
@@ -1365,8 +1377,11 @@ async fn get_tenant_config_handler(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let state = get_state(&request);
-    let tenant = mgr::get_tenant(tenant_shard_id, false)?;
+    let tenant = state
        .tenant_manager
        .get_attached_tenant_shard(tenant_shard_id)?;
    let response = HashMap::from([
        (
@@ -1394,13 +1409,31 @@ async fn update_tenant_config_handler(
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;
-    let tenant_conf =
+    let new_tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
    let state = get_state(&request);
-    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
+
-        .instrument(info_span!("tenant_config", %tenant_id))
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-        .await?;
+
    let tenant = state
        .tenant_manager
        .get_attached_tenant_shard(tenant_shard_id)?;
    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
    // This is a legacy API that only operates on attached tenants: the preferred
    // API to use is the location_config/ endpoint, which lets the caller provide
    // the full LocationConf.
    let location_conf = LocationConf::attached_single(
        new_tenant_conf.clone(),
        tenant.get_generation(),
        &ShardParameters::default(),
    );
    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(ApiError::InternalServerError)?;
    tenant.set_new_tenant_config(new_tenant_conf);
    json_response(StatusCode::OK, ())
 }
@@ -1423,13 +1456,14 @@ async fn put_tenant_location_config_handler(
    // The `Detached` state is special, it doesn't upsert a tenant, it removes
    // its local disk content and drops it from memory.
    if let LocationConfigMode::Detached = request_data.config.mode {
-        if let Err(e) =
+        if let Err(e) = state
-            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
+            .tenant_manager
-                .instrument(info_span!("tenant_detach",
+            .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
-                    tenant_id = %tenant_shard_id.tenant_id,
+            .instrument(info_span!("tenant_detach",
-                    shard_id = %tenant_shard_id.shard_slug()
+                tenant_id = %tenant_shard_id.tenant_id,
-                ))
+                shard_id = %tenant_shard_id.shard_slug()
-                .await
+            ))
            .await
        {
            match e {
                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
@@ -1623,10 +1657,12 @@ async fn handle_tenant_break(
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
-    let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
+    let state = get_state(&r);
-        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
+    state
-
+        .tenant_manager
-    tenant.set_broken("broken from test".to_owned()).await;
+        .get_attached_tenant_shard(tenant_shard_id)?
        .set_broken("broken from test".to_owned())
        .await;
    json_response(StatusCode::OK, ())
 }
@@ -1643,8 +1679,7 @@ async fn timeline_gc_handler(
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done =
+    let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
        mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
    let gc_result = wait_task_done
        .await
        .context("wait for gc task")
@@ -1871,7 +1906,7 @@ async fn active_timeline_of_active_tenant(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+    let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
@@ -1982,13 +2017,42 @@ async fn secondary_download_handler(
 ) -> Result<Response<Body>, ApiError> {
    let state = get_state(&request);
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    state
+    let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
        .secondary_controller
        .download_tenant(tenant_shard_id)
        .await
        .map_err(ApiError::InternalServerError)?;
-    json_response(StatusCode::OK, ())
+    // We don't need this to issue the download request, but:
    // - it enables us to cleanly return 404 if we get a request for an absent shard
    // - we will use this to provide status feedback in the response
    let Some(secondary_tenant) = state
        .tenant_manager
        .get_secondary_tenant_shard(tenant_shard_id)
    else {
        return Err(ApiError::NotFound(
            anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
        ));
    };
    let timeout = wait.unwrap_or(Duration::MAX);
    let status = match tokio::time::timeout(
        timeout,
        state.secondary_controller.download_tenant(tenant_shard_id),
    )
    .await
    {
        // Download job ran to completion.
        Ok(Ok(())) => StatusCode::OK,
        // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
        // okay.  We could get an error here in the unlikely edge case that the tenant
        // was detached between our check above and executing the download job.
        Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
        // A timeout is not an error: we have started the download, we're just not done
        // yet.  The caller will get a response body indicating status.
        Err(_) => StatusCode::ACCEPTED,
    };
    let progress = secondary_tenant.progress.lock().unwrap().clone();
    json_response(status, progress)
 }
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -2048,6 +2112,10 @@ async fn get_utilization(
    r: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    fail::fail_point!("get-utilization-http-handler", |_| {
        Err(ApiError::ResourceUnavailable("failpoint".into()))
    });
    // this probably could be completely public, but let's make that change later.
    check_permission(&r, None)?;
@@ -2224,6 +2292,7 @@ pub fn make_router(
    Ok(router
        .data(state)
        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,28 +2,20 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
 use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
 use anyhow::{bail, ensure, Context, Result};
 use async_compression::tokio::bufread::ZstdDecoder;
 use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
-use nix::NixPath;
+use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio::fs::{File, OpenOptions};
 use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
 use tracing::*;
 use walkdir::WalkDir;
 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
@@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
    reader.read_to_end(&mut buf).await?;
    Ok(Bytes::from(buf))
 }
 /// Packs the contents of `pgdata_path` into a zstd-compressed tar archive written to `tmp_path`.
 ///
 /// The file at `tmp_path` is created (or truncated if it already exists) and opened for
 /// both reading and writing. Every regular file and directory found below `pgdata_path`
 /// is added under its path relative to `pgdata_path`; entries that are neither files nor
 /// directories are skipped. Paths are sorted and the tar headers are written in
 /// deterministic mode so that the listing is more consistent between runs.
 ///
 /// Returns the archive file, rewound to its start, together with its compressed length in
 /// bytes. A warning is logged when that length exceeds 2 MiB.
 ///
 /// # Errors
 ///
 /// Fails if the temporary file cannot be opened, if walking `pgdata_path` or reading an
 /// entry's metadata fails, or if writing, finishing or rewinding the archive fails.
 pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
    let file = OpenOptions::new()
        .create(true)
        .truncate(true)
        .read(true)
        .write(true)
        .open(tmp_path)
        .await
        .with_context(|| format!("tempfile creation {tmp_path}"))?;
    let mut paths = Vec::new();
    for entry in WalkDir::new(pgdata_path) {
        let entry = entry?;
        // Metadata lookup is I/O and can fail (e.g. the entry vanished while walking):
        // report that as an error to the caller instead of panicking.
        let metadata = entry.metadata().with_context(|| {
            format!(
                "error getting dir entry metadata for {}",
                entry.path().display()
            )
        })?;
        // Also allow directories so that we also get empty directories
        if !(metadata.is_file() || metadata.is_dir()) {
            continue;
        }
        paths.push(entry.into_path());
    }
    // Do a sort to get a more consistent listing
    paths.sort_unstable();
    let zstd = ZstdEncoder::with_quality_and_params(
        file,
        Level::Default,
        &[CParameter::enable_long_distance_matching(true)],
    );
    let mut builder = Builder::new(zstd);
    // Use reproducible header mode
    builder.mode(HeaderMode::Deterministic);
    for path in paths {
        let rel_path = path.strip_prefix(pgdata_path)?;
        if rel_path.is_empty() {
            // The top directory should not be compressed,
            // the tar crate doesn't like that
            continue;
        }
        builder.append_path_with_name(&path, rel_path).await?;
    }
    // Finish the tar stream first, then shut the encoder down so the zstd frame is completed
    // before the length of the underlying file is measured.
    let mut zstd = builder.into_inner().await?;
    zstd.shutdown().await?;
    let mut compressed = zstd.into_inner();
    let compressed_len = compressed.metadata().await?.len();
    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
    if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
        warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
    }
    // Hand the file back positioned at the start so the caller can read it directly.
    compressed.seek(SeekFrom::Start(0)).await?;
    Ok((compressed, compressed_len))
 }
 /// Decompresses the zstd stream `tar_zst` and unpacks the tar archive it contains
 /// into the directory `pgdata_path`.
 ///
 /// # Errors
 ///
 /// Returns an error if unpacking the archive fails.
 pub async fn extract_tar_zst(
    pgdata_path: &Utf8Path,
    tar_zst: impl AsyncBufRead + Unpin,
 ) -> Result<()> {
    // The decoder is pinned on the heap before it is handed to the tar reader.
    let decoder = Box::pin(ZstdDecoder::new(tar_zst));
    Archive::new(decoder).unpack(pgdata_path).await?;
    Ok(())
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -31,6 +31,7 @@ pub mod walredo;
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::mgr::TenantManager;
 use tracing::info;
 /// Current storage format version
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 pub use crate::metrics::preinitialize_metrics;
 #[tracing::instrument(skip_all, fields(%exit_code))]
-pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
+pub async fn shutdown_pageserver(
    tenant_manager: &TenantManager,
    deletion_queue: Option<DeletionQueue>,
    exit_code: i32,
 ) {
    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
    timed(
-        tenant::mgr::shutdown_all_tenants(),
+        tenant_manager.shutdown(),
        "shutdown all tenants",
        Duration::from_secs(5),
    )
@@ -114,27 +119,27 @@ pub const METADATA_FILE_NAME: &str = "metadata";
 /// Per-tenant configuration file.
 /// Full path: `tenants/<tenant_id>/config`.
-pub const TENANT_CONFIG_NAME: &str = "config";
+pub(crate) const TENANT_CONFIG_NAME: &str = "config";
 /// Per-tenant configuration file.
 /// Full path: `tenants/<tenant_id>/config`.
-pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
+pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
 /// Per-tenant copy of their remote heatmap, downloaded into the local
 /// tenant path while in secondary mode.
-pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
+pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
-pub const TEMP_FILE_SUFFIX: &str = "___temp";
+pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
 /// A marker file to mark that a timeline directory was not fully initialized.
 /// If a timeline directory with this marker is encountered at pageserver startup,
 /// the timeline directory and the marker file are both removed.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
-pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
+pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
-pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
+pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
 /// A marker file to prevent pageserver from loading a certain tenant on restart.
 /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
@@ -161,11 +166,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
 // from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
 // from the name.
-pub fn is_uninit_mark(path: &Utf8Path) -> bool {
+pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
-pub fn is_delete_mark(path: &Utf8Path) -> bool {
+pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,5 +1,4 @@
 use enum_map::EnumMap;
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
@@ -168,7 +167,7 @@ impl GetVectoredLatency {
 pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
    let inner = register_histogram_vec!(
        "pageserver_get_vectored_seconds",
-        "Time spent in get_vectored",
+        "Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
        &["task_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
@@ -436,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(||
 static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_remote_physical_size",
-        "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
+        "The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
        // Corollary: If any files are missing from the index part, they won't be included here.
        &["tenant_id", "shard_id", "timeline_id"]
    )
@@ -700,6 +699,14 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
    .expect("Failed to register pageserver_startup_is_loading")
 });
 /// Gauge `pageserver_timeline_ephemeral_bytes`: per its help text, the approximate,
 /// lazily updated total number of bytes held in ephemeral layers across all timelines.
 ///
 /// The metric is registered on first access; a registration failure panics.
 pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
        "pageserver_timeline_ephemeral_bytes",
        "Total number of bytes in ephemeral layers, summed for all timelines.  Approximate, lazily updated."
    )
    .expect("Failed to register metric")
 });
 /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
 /// like how long it took to load.
 ///
@@ -1283,11 +1290,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
    })
 });
-impl DurationResultObserver for BasebackupQueryTime {
+pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
+    parent: &'a BasebackupQueryTime,
    ctx: &'c RequestContext,
    start: std::time::Instant,
 }
 impl BasebackupQueryTime {
    /// Begins timing a basebackup query.
    ///
    /// Captures the current instant and opens `ctx.micros_spent_throttled`. If opening
    /// fails, the failure is only logged as a warning (rate limited globally to one
    /// message per 10 seconds) and recording proceeds anyway.
    ///
    /// The returned recording borrows both `self` and `ctx`.
    pub(crate) fn start_recording<'c: 'a, 'a>(
        &'a self,
        ctx: &'c RequestContext,
    ) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
        // Take the timestamp before touching the throttle accounting.
        let started_at = Instant::now();
        if let Err(error) = ctx.micros_spent_throttled.open() {
            use utils::rate_limit::RateLimit;
            static LOGGED: Lazy<Mutex<RateLimit>> =
                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
            LOGGED.lock().unwrap().call(|| {
                warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
            });
        }
        BasebackupQueryTimeOngoingRecording {
            parent: self,
            ctx,
            start: started_at,
        }
    }
 }
 impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
    pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
        let elapsed = self.start.elapsed();
        let ex_throttled = self
            .ctx
            .micros_spent_throttled
            .close_and_checked_sub_from(elapsed);
        let ex_throttled = match ex_throttled {
            Ok(ex_throttled) => ex_throttled,
            Err(error) => {
                use utils::rate_limit::RateLimit;
                static LOGGED: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
                let mut rate_limit = LOGGED.lock().unwrap();
                rate_limit.call(|| {
                    warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
                });
                elapsed
            }
        };
        let label_value = if res.is_ok() { "ok" } else { "error" };
-        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
+        let metric = self
-        metric.observe(duration.as_secs_f64());
+            .parent
            .0
            .get_metric_with_label_values(&[label_value])
            .unwrap();
        metric.observe(ex_throttled.as_secs_f64());
    }
 }
@@ -1964,10 +2025,8 @@ impl TimelineMetrics {
    pub(crate) fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
 }
-impl Drop for TimelineMetrics {
+    pub(crate) fn shutdown(&self) {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
@@ -2414,7 +2473,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }
 pub mod tokio_epoll_uring {
-    use metrics::UIntGauge;
+    use metrics::{register_int_counter, UIntGauge};
    use once_cell::sync::Lazy;
    pub struct Collector {
        descs: Vec<metrics::core::Desc>,
@@ -2422,15 +2482,13 @@ pub mod tokio_epoll_uring {
        systems_destroyed: UIntGauge,
    }
    const NMETRICS: usize = 2;
    impl metrics::core::Collector for Collector {
        fn desc(&self) -> Vec<&metrics::core::Desc> {
            self.descs.iter().collect()
        }
        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
-            let mut mfs = Vec::with_capacity(NMETRICS);
+            let mut mfs = Vec::with_capacity(Self::NMETRICS);
            let tokio_epoll_uring::metrics::Metrics {
                systems_created,
                systems_destroyed,
@@ -2444,6 +2502,8 @@ pub mod tokio_epoll_uring {
    }
    impl Collector {
        const NMETRICS: usize = 2;
        #[allow(clippy::new_without_default)]
        pub fn new() -> Self {
            let mut descs = Vec::new();
@@ -2477,6 +2537,22 @@ pub mod tokio_epoll_uring {
            }
        }
    }
    /// Counter `pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count`.
    ///
    /// The metric is registered on first access; a registration failure panics.
    ///
    /// NOTE(review): the help text describes creation that "spanned multiple executor
    /// threads", which does not obviously match the "launch success" name — confirm which
    /// one reflects what is actually counted.
    pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
        register_int_counter!(
            "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
            "Number of times where thread_local_system creation spanned multiple executor threads",
        )
        .unwrap()
    });
    /// Counter `pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count`:
    /// per its help text, the number of times `thread_local_system` creation failed and was
    /// retried after a back-off.
    ///
    /// The metric is registered on first access; a registration failure panics.
    pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
        register_int_counter!(
            "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
            "Number of times thread_local_system creation failed and was retried after back-off.",
        )
        .unwrap()
    });
 }
 pub(crate) mod tenant_throttling {
@@ -2605,6 +2681,8 @@ pub fn preinitialize_metrics() {
        &WALRECEIVER_BROKER_UPDATES,
        &WALRECEIVER_CANDIDATES_ADDED,
        &WALRECEIVER_CANDIDATES_REMOVED,
        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
    ]
    .into_iter()
    .for_each(|c| {
@@ -2623,6 +2701,12 @@ pub fn preinitialize_metrics() {
    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
    Lazy::force(&disk_usage_based_eviction::METRICS);
    for state_name in pageserver_api::models::TenantState::VARIANTS {
        // initialize the metric for all gauges, otherwise the time series might seemingly show
        // values from last restart.
        TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
    }
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -760,6 +760,7 @@ impl PageServerHandler {
        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
        timeline
            .import_basebackup_from_tar(
                tenant.clone(),
                &mut copyin_reader,
                base_lsn,
                self.broker_client.clone(),
@@ -1199,7 +1200,7 @@ impl PageServerHandler {
        prev_lsn: Option<Lsn>,
        full_backup: bool,
        gzip: bool,
-        ctx: RequestContext,
+        ctx: &RequestContext,
    ) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1214,7 +1215,7 @@ impl PageServerHandler {
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
            info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, &ctx).await?;
+            timeline.wait_lsn(lsn, ctx).await?;
            timeline
                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                .context("invalid basebackup lsn")?;
@@ -1236,7 +1237,7 @@ impl PageServerHandler {
                lsn,
                prev_lsn,
                full_backup,
-                &ctx,
+                ctx,
            )
            .await?;
        } else {
@@ -1257,7 +1258,7 @@ impl PageServerHandler {
                    lsn,
                    prev_lsn,
                    full_backup,
-                    &ctx,
+                    ctx,
                )
                .await?;
                // shutdown the encoder to ensure the gzip footer is written
@@ -1269,7 +1270,7 @@ impl PageServerHandler {
                    lsn,
                    prev_lsn,
                    full_backup,
-                    &ctx,
+                    ctx,
                )
                .await?;
            }
@@ -1449,25 +1450,25 @@ where
                false
            };
-            ::metrics::metric_vec_duration::observe_async_block_duration_by_result(
+            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
-                &*metrics::BASEBACKUP_QUERY_TIME,
+            let res = async {
-                async move {
+                self.handle_basebackup_request(
-                    self.handle_basebackup_request(
+                    pgb,
-                        pgb,
+                    tenant_id,
-                        tenant_id,
+                    timeline_id,
-                        timeline_id,
+                    lsn,
-                        lsn,
+                    None,
-                        None,
+                    false,
-                        false,
+                    gzip,
-                        gzip,
+                    &ctx,
-                        ctx,
+                )
-                    )
+                .await?;
-                    .await?;
+                pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                Result::<(), QueryError>::Ok(())
-                    Result::<(), QueryError>::Ok(())
+            }
-                },
+            .await;
-            )
+            metric_recording.observe(&res);
-            .await?;
+            res?;
        }
        // return pair of prev_lsn and last_lsn
        else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1563,7 +1564,7 @@ where
                prev_lsn,
                true,
                false,
-                ctx,
+                &ctx,
            )
            .await?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -34,6 +34,7 @@ use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> {
        if !self.pending_updates.is_empty() {
            // The put_batch call below expects the inputs to be sorted by Lsn,
            // so we do that first.
-            let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
+            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
-                .pending_updates
+                self.pending_updates
-                .drain()
+                    .drain()
-                .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
+                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
-                .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
+                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
-                .collect();
+                VecMapOrdering::GreaterOrEqual,
            );
            writer.put_batch(lsn_ordered_batch, ctx).await?;
        }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
 use utils::id::TimelineId;
 use crate::shutdown_pageserver;
 //
 // There are four runtimes:
 //
@@ -272,9 +270,6 @@ pub enum TaskKind {
    // Task that uploads a file to remote storage
    RemoteUploadTask,
    // Task that downloads a file from remote storage
    RemoteDownloadTask,
    // task that handles the initial downloading of all tenants
    InitialLoad,
@@ -456,7 +451,7 @@ async fn task_finish(
    }
    if shutdown_process {
-        shutdown_pageserver(None, 1).await;
+        std::process::exit(1);
    }
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -43,6 +43,8 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
 use utils::zstd::create_zst_tarball;
 use utils::zstd::extract_zst_tarball;
 use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
@@ -55,8 +57,8 @@ use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineCreateGuard;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
@@ -200,6 +202,13 @@ pub(super) struct AttachedTenantConf {
 }
 impl AttachedTenantConf {
    /// Builds an `AttachedTenantConf` from a tenant configuration and the attached-mode
    /// location configuration, storing both values unchanged.
    fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
        Self {
            tenant_conf,
            location,
        }
    }
    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
        match &location_conf.mode {
            LocationMode::Attached(attach_conf) => Ok(Self {
@@ -565,9 +574,8 @@ impl Tenant {
            // avoiding holding it across awaits
            let mut timelines_accessor = self.timelines.lock().unwrap();
            match timelines_accessor.entry(timeline_id) {
                // We should never try and load the same timeline twice during startup
                Entry::Occupied(_) => {
                    // The uninit mark file acts as a lock that prevents another task from
                    // initializing the timeline at the same time.
                    unreachable!(
                        "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
                    );
@@ -677,9 +685,20 @@ impl Tenant {
                }
                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
                enum BrokenVerbosity {
                    Error,
                    Info
                }
                let make_broken =
-                    |t: &Tenant, err: anyhow::Error| {
+                    |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
-                        error!("attach failed, setting tenant state to Broken: {err:?}");
+                        match verbosity {
                            BrokenVerbosity::Info => {
                                info!("attach cancelled, setting tenant state to Broken: {err}");
                            },
                            BrokenVerbosity::Error => {
                                error!("attach failed, setting tenant state to Broken: {err:?}");
                            }
                        }
                        t.state.send_modify(|state| {
                            // The Stopping case is for when we have passed control on to DeleteTenantFlow:
                            // if it errors, we will call make_broken when tenant is already in Stopping.
@@ -743,7 +762,7 @@ impl Tenant {
                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
                            // just shutting down), but ensures progress.
-                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
+                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
                            return Ok(());
                        },
                    )
@@ -765,7 +784,7 @@ impl Tenant {
                        match res {
                            Ok(p) => Some(p),
                            Err(e) => {
-                                make_broken(&tenant_clone, anyhow::anyhow!(e));
+                                make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                                return Ok(());
                            }
                        }
@@ -789,7 +808,7 @@ impl Tenant {
                    {
                        Ok(should_resume_deletion) => should_resume_deletion,
                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
                            return Ok(());
                        }
                    }
@@ -819,7 +838,7 @@ impl Tenant {
                    .await;
                    if let Err(e) = deleted {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e));
+                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                    }
                    return Ok(());
@@ -840,7 +859,7 @@ impl Tenant {
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
                    Err(e) => {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e));
+                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
                    }
                }
@@ -1064,8 +1083,7 @@ impl Tenant {
            let entry_path = entry.path();
            let purge = if crate::is_temporary(entry_path)
-                // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already
+                // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
                // covered by the check that the timeline must exist in remote storage.
                || is_uninit_mark(entry_path)
                || crate::is_delete_mark(entry_path)
            {
@@ -1298,11 +1316,6 @@ impl Tenant {
    /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
    /// and the timeline will fail to load at a restart.
    ///
    /// That's why we add an uninit mark file, and wrap it together with the Timeline
    /// in-memory object into UninitializedTimeline.
    /// Once the caller is done setting up the timeline, they should call
    /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark.
    ///
    /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
    /// minimum amount of keys required to get a writable timeline.
    /// (Without it, `put` might fail due to `repartition` failing.)
@@ -1318,7 +1331,9 @@ impl Tenant {
            "Cannot create empty timelines on inactive tenant"
        );
-        let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
+        // Protect against concurrent attempts to use this TimelineId
        let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
        let new_metadata = TimelineMetadata::new(
            // Initialize disk_consistent LSN to 0, The caller must import some data to
            // make it valid, before calling finish_creation()
@@ -1333,7 +1348,7 @@ impl Tenant {
        self.prepare_new_timeline(
            new_timeline_id,
            &new_metadata,
-            timeline_uninit_mark,
+            create_guard,
            initdb_lsn,
            None,
        )
@@ -1396,7 +1411,7 @@ impl Tenant {
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn create_timeline(
-        &self,
+        self: &Arc<Tenant>,
        new_timeline_id: TimelineId,
        ancestor_timeline_id: Option<TimelineId>,
        mut ancestor_start_lsn: Option<Lsn>,
@@ -1421,9 +1436,8 @@ impl Tenant {
            .map_err(|_| CreateTimelineError::ShuttingDown)?;
        // Get exclusive access to the timeline ID: this ensures that it does not already exist,
-        // and that no other creation attempts will be allowed in while we are working.  The
+        // and that no other creation attempts will be allowed in while we are working.
-        // uninit_mark is a guard.
+        let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
        let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
            Ok(m) => m,
            Err(TimelineExclusionError::AlreadyCreating) => {
                // Creation is in progress, we cannot create it again, and we cannot
@@ -1466,6 +1480,8 @@ impl Tenant {
            }
        };
        pausable_failpoint!("timeline-creation-after-uninit");
        let loaded_timeline = match ancestor_timeline_id {
            Some(ancestor_timeline_id) => {
                let ancestor_timeline = self
@@ -1513,7 +1529,7 @@ impl Tenant {
                    &ancestor_timeline,
                    new_timeline_id,
                    ancestor_start_lsn,
-                    uninit_mark,
+                    create_guard,
                    ctx,
                )
                .await?
@@ -1523,7 +1539,7 @@ impl Tenant {
                    new_timeline_id,
                    pg_version,
                    load_existing_initdb,
-                    uninit_mark,
+                    create_guard,
                    ctx,
                )
                .await?
@@ -1543,7 +1559,7 @@ impl Tenant {
            })?;
        }
-        loaded_timeline.activate(broker_client, None, ctx);
+        loaded_timeline.activate(self.clone(), broker_client, None, ctx);
        Ok(loaded_timeline)
    }
@@ -1715,7 +1731,12 @@ impl Tenant {
            let mut activated_timelines = 0;
            for timeline in timelines_to_activate {
-                timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
+                timeline.activate(
                    self.clone(),
                    broker_client.clone(),
                    background_jobs_can_start,
                    ctx,
                );
                activated_timelines += 1;
            }
@@ -1846,6 +1867,8 @@ impl Tenant {
        // Wait for any in-flight operations to complete
        self.gate.close().await;
        remove_tenant_metrics(&self.tenant_shard_id);
        Ok(())
    }
@@ -2045,7 +2068,12 @@ impl Tenant {
                TenantState::Active { .. } => {
                    return Ok(());
                }
-                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
+                TenantState::Broken { reason, .. } => {
                    // This is fatal, and reported distinctly from the general case of "will never be active" because
                    // it's logically a 500 to external API users (broken is always a bug).
                    return Err(GetActiveTenantError::Broken(reason));
                }
                TenantState::Stopping { .. } => {
                    // There's no chance the tenant can transition back into ::Active
                    return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
                }
@@ -2123,7 +2151,7 @@ impl Tenant {
            // Shut down the timeline's remote client: this means that the indices we write
            // for child shards will not be invalidated by the parent shard deleting layers.
-            tl_client.shutdown().await?;
+            tl_client.shutdown().await;
            // Download methods can still be used after shutdown, as they don't flow through the remote client's
            // queue.  In principle the RemoteTimelineClient could provide this without downloading it, but this
@@ -2868,9 +2896,9 @@ impl Tenant {
        start_lsn: Option<Lsn>,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
+        let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
        let tl = self
-            .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
+            .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
            .await?;
        tl.set_state(TimelineState::Active);
        Ok(tl)
@@ -2884,10 +2912,10 @@ impl Tenant {
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
-        timeline_uninit_mark: TimelineUninitMark<'_>,
+        timeline_create_guard: TimelineCreateGuard<'_>,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
+        self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
            .await
    }
@@ -2896,7 +2924,7 @@ impl Tenant {
        src_timeline: &Arc<Timeline>,
        dst_id: TimelineId,
        start_lsn: Option<Lsn>,
-        timeline_uninit_mark: TimelineUninitMark<'_>,
+        timeline_create_guard: TimelineCreateGuard<'_>,
        _ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        let src_id = src_timeline.timeline_id;
@@ -2980,7 +3008,7 @@ impl Tenant {
            .prepare_new_timeline(
                dst_id,
                &metadata,
-                timeline_uninit_mark,
+                timeline_create_guard,
                start_lsn + 1,
                Some(Arc::clone(src_timeline)),
            )
@@ -3012,12 +3040,12 @@ impl Tenant {
        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
+        let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
        self.bootstrap_timeline(
            timeline_id,
            pg_version,
            load_existing_initdb,
-            uninit_mark,
+            create_guard,
            ctx,
        )
        .await
@@ -3044,8 +3072,13 @@ impl Tenant {
            }
        }
-        let (pgdata_zstd, tar_zst_size) =
+        let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
-            import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
+        const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
        if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
            warn!(
                "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
            );
        }
        pausable_failpoint!("before-initdb-upload");
@@ -3081,7 +3114,7 @@ impl Tenant {
        timeline_id: TimelineId,
        pg_version: u32,
        load_existing_initdb: Option<TimelineId>,
-        timeline_uninit_mark: TimelineUninitMark<'_>,
+        timeline_create_guard: TimelineCreateGuard<'_>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
@@ -3093,13 +3126,14 @@ impl Tenant {
            TEMP_FILE_SUFFIX,
        );
-        // an uninit mark was placed before, nothing else can access this timeline files
+        // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
-        // current initdb was not run yet, so remove whatever was left from the previous runs
+        // we won't race with other creations or existent timelines with the same path.
        if pgdata_path.exists() {
            fs::remove_dir_all(&pgdata_path).with_context(|| {
                format!("Failed to remove already existing initdb directory: {pgdata_path}")
            })?;
        }
        // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
        scopeguard::defer! {
            if let Err(e) = fs::remove_dir_all(&pgdata_path) {
@@ -3144,7 +3178,7 @@ impl Tenant {
            let buf_read =
                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
-            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
+            extract_zst_tarball(&pgdata_path, buf_read)
                .await
                .context("extract initdb tar")?;
        } else {
@@ -3176,7 +3210,7 @@ impl Tenant {
            .prepare_new_timeline(
                timeline_id,
                &new_metadata,
-                timeline_uninit_mark,
+                timeline_create_guard,
                pgdata_lsn,
                None,
            )
@@ -3248,13 +3282,12 @@ impl Tenant {
    ///
    /// An empty layer map is initialized, and new data and WAL can be imported starting
    /// at 'disk_consistent_lsn'. After any initial data has been imported, call
-    /// `finish_creation` to insert the Timeline into the timelines map and to remove the
+    /// `finish_creation` to insert the Timeline into the timelines map.
    /// uninit mark file.
    async fn prepare_new_timeline<'a>(
        &'a self,
        new_timeline_id: TimelineId,
        new_metadata: &TimelineMetadata,
-        uninit_mark: TimelineUninitMark<'a>,
+        create_guard: TimelineCreateGuard<'a>,
        start_lsn: Lsn,
        ancestor: Option<Arc<Timeline>>,
    ) -> anyhow::Result<UninitializedTimeline> {
@@ -3277,9 +3310,12 @@ impl Tenant {
        timeline_struct.init_empty_layer_map(start_lsn);
-        if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
+        if let Err(e) = self
            .create_timeline_files(&create_guard.timeline_path)
            .await
        {
            error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
-            cleanup_timeline_directory(uninit_mark);
+            cleanup_timeline_directory(create_guard);
            return Err(e);
        }
@@ -3290,41 +3326,31 @@ impl Tenant {
        Ok(UninitializedTimeline::new(
            self,
            new_timeline_id,
-            Some((timeline_struct, uninit_mark)),
+            Some((timeline_struct, create_guard)),
        ))
    }
    async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
        crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
-        fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
+        fail::fail_point!("after-timeline-dir-creation", |_| {
-            anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
+            anyhow::bail!("failpoint after-timeline-dir-creation");
        });
        Ok(())
    }
-    /// Attempts to create an uninit mark file for the timeline initialization.
+    /// Get a guard that provides exclusive access to the timeline directory, preventing
-    /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists.
+    /// concurrent attempts to create the same timeline.
-    ///
+    fn create_timeline_create_guard(
    /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init.
    fn create_timeline_uninit_mark(
        &self,
        timeline_id: TimelineId,
-    ) -> Result<TimelineUninitMark, TimelineExclusionError> {
+    ) -> Result<TimelineCreateGuard, TimelineExclusionError> {
        let tenant_shard_id = self.tenant_shard_id;
        let uninit_mark_path = self
            .conf
            .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
        let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
-        let uninit_mark = TimelineUninitMark::new(
+        let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
            self,
            timeline_id,
            uninit_mark_path.clone(),
            timeline_path.clone(),
        )?;
        // At this stage, we have got exclusive access to in-memory state for this timeline ID
        // for creation.
@@ -3340,23 +3366,7 @@ impl Tenant {
            )));
        }
-        // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
+        Ok(create_guard)
        // that during process runtime, colliding creations will be caught in-memory without getting
        // as far as failing to write a file.
        fs::OpenOptions::new()
            .write(true)
            .create_new(true)
            .open(&uninit_mark_path)
            .context("Failed to create uninit mark file")
            .and_then(|_| {
                crashsafe::fsync_file_and_parent(&uninit_mark_path)
                    .context("Failed to fsync uninit mark file")
            })
            .with_context(|| {
                format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
            })?;
        Ok(uninit_mark)
    }
    /// Gathers inputs from all of the timelines to produce a sizing model input.
@@ -3557,11 +3567,6 @@ async fn run_initdb(
    Ok(())
 }
 // Cleanup hook: when a `Tenant` value is dropped, `remove_tenant_metrics` is called
 // with this tenant's `tenant_shard_id`.
 impl Drop for Tenant {
    fn drop(&mut self) {
        // NOTE(review): presumably unregisters the per-tenant metrics so they do not
        // outlive the tenant object; confirm against `remove_tenant_metrics`.
        remove_tenant_metrics(&self.tenant_shard_id);
    }
 }
 /// Dump contents of a layer file to stdout.
 pub async fn dump_layerfile_from_path(
    path: &Utf8Path,
@@ -4628,10 +4633,7 @@ mod tests {
        drop(guard);
        // Pick a big LSN such that we query over all the changes.
-        // Technically, u64::MAX - 1 is the largest LSN supported by the read path,
+        let reads_lsn = Lsn(u64::MAX - 1);
        // but there seems to be a bug on the non-vectored search path which surfaces
        // in that case.
        let reads_lsn = Lsn(u64::MAX - 1000);
        for read in reads {
            info!("Doing vectored read on {:?}", read);
@@ -5105,15 +5107,15 @@ mod tests {
    }
    #[tokio::test]
-    async fn test_uninit_mark_crash() -> anyhow::Result<()> {
+    async fn test_create_guard_crash() -> anyhow::Result<()> {
-        let name = "test_uninit_mark_crash";
+        let name = "test_create_guard_crash";
        let harness = TenantHarness::create(name)?;
        {
            let (tenant, ctx) = harness.load().await;
            let tline = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
                .await?;
-            // Keeps uninit mark in place
+            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
                .shutdown()
@@ -5141,10 +5143,24 @@ mod tests {
            .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
            .exists());
-        assert!(!harness
+        Ok(())
-            .conf
+    }
-            .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID)
+
-            .exists());
+    #[tokio::test]
    // Verifies that a point read of a single key at `Lsn(u64::MAX - 1)` returns `Ok`
    // on a freshly created test timeline after `bulk_insert_compact_gc` has run on it.
    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_read_at_max_lsn")?;
        let (tenant, ctx) = harness.load().await;
        // The timeline is created at LSN 0x08.
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let lsn = Lsn(0x10);
        // NOTE(review): presumably writes data starting at `lsn` and runs compaction/GC;
        // the meaning of the `50` and `10000` arguments is defined by
        // `bulk_insert_compact_gc` and is not visible here -- confirm.
        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        // Read far above any LSN used earlier in this test.
        let read_lsn = Lsn(u64::MAX - 1);
        // Only success of the read is asserted; the returned value is not inspected.
        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
        Ok(())
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -196,16 +196,17 @@ impl LocationConf {
    /// For use when attaching/re-attaching: update the generation stored in this
    /// structure.  If we were in a secondary state, promote to attached (possession
    /// of a fresh generation implies this).
-    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
+    pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
        match &mut self.mode {
            LocationMode::Attached(attach_conf) => {
                attach_conf.generation = generation;
                attach_conf.attach_mode = mode;
            }
            LocationMode::Secondary(_) => {
                // We are promoted to attached by the control plane's re-attach response
                self.mode = LocationMode::Attached(AttachedLocationConfig {
                    generation,
-                    attach_mode: AttachmentMode::Single,
+                    attach_mode: mode,
                })
            }
        }
@@ -354,6 +355,7 @@ pub struct TenantConf {
    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
    /// may be disabled if a Tenant will not have secondary locations: only secondary
    /// locations will use the heatmap uploaded by attached locations.
    #[serde(with = "humantime_serde")]
    pub heatmap_period: Duration,
    /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -111,6 +111,7 @@ async fn create_local_delete_mark(
    let _ = std::fs::OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&marker_path)
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
@@ -296,6 +297,7 @@ impl DeleteTenantFlow {
        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
        cancel: &CancellationToken,
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();
@@ -303,7 +305,9 @@ impl DeleteTenantFlow {
        let mut guard = Self::prepare(&tenant).await?;
-        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
+        if let Err(e) =
            Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
        {
            tenant.set_broken(format!("{e:#}")).await;
            return Err(e);
        }
@@ -322,6 +326,7 @@ impl DeleteTenantFlow {
        conf: &'static PageServerConf,
        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
        cancel: &CancellationToken,
    ) -> Result<(), DeleteTenantError> {
        guard.mark_in_progress()?;
@@ -335,15 +340,9 @@ impl DeleteTenantFlow {
        // Though sounds scary, different mark name?
        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
-                conf,
+                .await
-                remote_storage,
+                .context("remote_mark")?
                &tenant.tenant_shard_id,
                // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
                &CancellationToken::new(),
            )
            .await
            .context("remote_mark")?
        }
        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -546,8 +545,7 @@ impl DeleteTenantFlow {
            conf,
            remote_storage.as_ref(),
            &tenant.tenant_shard_id,
-            // Can't use tenant.cancel, it's already shut down.  TODO: wire in an appropriate token
+            &task_mgr::shutdown_token(),
            &CancellationToken::new(),
        )
        .await?;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
-use crate::tenant::upload_queue::Delete;
+use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
    config::PageServerConf,
@@ -266,15 +266,6 @@ pub enum MaybeDeletedIndexPart {
    Deleted(IndexPart),
 }
 /// Errors that can arise when calling [`RemoteTimelineClient::stop`].
 // The human-readable message of each variant is taken from its `#[error(...)]`
 // attribute (via the `thiserror::Error` derive).
 #[derive(Debug, thiserror::Error)]
 pub enum StopError {
    /// Returned if the upload queue was never initialized.
    /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
    #[error("queue is not initialized")]
    QueueUninitialized,
 }
 #[derive(Debug, thiserror::Error)]
 pub enum PersistIndexPartWithDeletedFlagError {
    #[error("another task is already setting the deleted_flag, started at {0:?}")]
@@ -399,15 +390,10 @@ impl RemoteTimelineClient {
            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
        ))?;
        {
            let mut upload_queue = self.upload_queue.lock().unwrap();
            upload_queue.initialize_with_current_remote_index_part(index_part)?;
            self.update_remote_physical_size_gauge(Some(index_part));
        }
        // also locks upload queue, without dropping the guard above it will be a deadlock
        self.stop().expect("initialized line above");
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue.initialize_with_current_remote_index_part(index_part)?;
        self.update_remote_physical_size_gauge(Some(index_part));
        self.stop_impl(&mut upload_queue);
        upload_queue
            .stopped_mut()
@@ -421,7 +407,8 @@ impl RemoteTimelineClient {
        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
-            UploadQueue::Stopped(q) => q
+            UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
            UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q
                .upload_queue_for_deletion
                .get_last_remote_consistent_lsn_projected(),
        }
@@ -431,7 +418,8 @@ impl RemoteTimelineClient {
        match &mut *self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
-            UploadQueue::Stopped(q) => Some(
+            UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
            UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some(
                q.upload_queue_for_deletion
                    .get_last_remote_consistent_lsn_visible(),
            ),
@@ -898,7 +886,7 @@ impl RemoteTimelineClient {
    /// Wait for all previously scheduled operations to complete, and then stop.
    ///
    /// Not cancellation safe
-    pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
+    pub(crate) async fn shutdown(self: &Arc<Self>) {
        // On cancellation the queue is left in an awkward state of refusing new operations but
        // proper stop is yet to be called. On cancel the original or some later task must call
        // `stop` or `shutdown`.
@@ -909,8 +897,12 @@ impl RemoteTimelineClient {
        let fut = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = match &mut *guard {
-                UploadQueue::Stopped(_) => return Ok(()),
+                UploadQueue::Stopped(_) => return,
-                UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
+                UploadQueue::Uninitialized => {
                    // transition into Stopped state
                    self.stop_impl(&mut guard);
                    return;
                }
                UploadQueue::Initialized(ref mut init) => init,
            };
@@ -942,7 +934,7 @@ impl RemoteTimelineClient {
            }
        }
-        self.stop()
+        self.stop();
    }
    /// Set the deleted_at field in the remote index file.
@@ -1324,12 +1316,7 @@ impl RemoteTimelineClient {
            // upload finishes or times out soon enough.
            if cancel.is_cancelled() {
                info!("upload task cancelled by shutdown request");
-                match self.stop() {
+                self.stop();
                    Ok(()) => {}
                    Err(StopError::QueueUninitialized) => {
                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                    }
                }
                return;
            }
@@ -1584,17 +1571,23 @@ impl RemoteTimelineClient {
    /// In-progress operations will still be running after this function returns.
    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
    /// to wait for them to complete, after calling this function.
-    pub(crate) fn stop(&self) -> Result<(), StopError> {
+    pub(crate) fn stop(&self) {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all of the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
        let mut guard = self.upload_queue.lock().unwrap();
-        match &mut *guard {
+        self.stop_impl(&mut guard);
-            UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
+    }
    fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
        match &mut **guard {
            UploadQueue::Uninitialized => {
                info!("UploadQueue is in state Uninitialized, nothing to do");
                **guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
            }
            UploadQueue::Stopped(_) => {
                // nothing to do
                info!("another concurrent task already shut down the queue");
                Ok(())
            }
            UploadQueue::Initialized(initialized) => {
                info!("shutting down upload queue");
@@ -1627,11 +1620,13 @@ impl RemoteTimelineClient {
                    };
                    let upload_queue = std::mem::replace(
-                        &mut *guard,
+                        &mut **guard,
-                        UploadQueue::Stopped(UploadQueueStopped {
+                        UploadQueue::Stopped(UploadQueueStopped::Deletable(
-                            upload_queue_for_deletion,
+                            UploadQueueStoppedDeletable {
-                            deleted_at: SetDeletedFlagProgress::NotRunning,
+                                upload_queue_for_deletion,
-                        }),
+                                deleted_at: SetDeletedFlagProgress::NotRunning,
                            },
                        )),
                    );
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
@@ -1660,10 +1655,6 @@ impl RemoteTimelineClient {
                    // which is exactly what we want to happen.
                    drop(op);
                }
                // We're done.
                drop(guard);
                Ok(())
            }
        }
    }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
-use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
+use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::TimelineId;
@@ -73,55 +73,13 @@ pub async fn download_layer_file<'a>(
    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
-    let (mut destination_file, bytes_amount) = download_retry(
+    let bytes_amount = download_retry(
-        || async {
+        || async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
            let destination_file = tokio::fs::File::create(&temp_file_path)
                .await
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;
            let download = storage.download(&remote_path, cancel).await?;
            let mut destination_file =
                tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
            match bytes_amount {
                Ok(bytes_amount) => {
                    let destination_file = destination_file.into_inner();
                    Ok((destination_file, bytes_amount))
                }
                Err(e) => {
                    if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
                    }
                    Err(e.into())
                }
            }
        },
        &format!("download {remote_path:?}"),
        cancel,
    )
    .await?;
    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
    // A file will not be closed immediately when it goes out of scope if there are any IO operations
    // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
    // you should call flush before dropping it.
    //
    // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
    // we assume that `destination_file` file is fully written, i.e. there are no pending .write(...).await operations.
    // But for additional safety let's check/wait for any pending operations.
    destination_file
        .flush()
        .await
        .with_context(|| format!("flush source file at {temp_file_path}"))
        .map_err(DownloadError::Other)?;
    let expected = layer_metadata.file_size();
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
@@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>(
        )));
    }
    // not using sync_data because it can lose file size update
    destination_file
        .sync_all()
        .await
        .with_context(|| format!("failed to fsync source file at {temp_file_path}"))
        .map_err(DownloadError::Other)?;
    drop(destination_file);
    fail::fail_point!("remote-storage-download-pre-rename", |_| {
        Err(DownloadError::Other(anyhow!(
            "remote-storage-download-pre-rename failpoint triggered"
@@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>(
    Ok(bytes_amount)
 }
 /// Download the object `src_path` in the remote `storage` to local path `dst_path`.
 ///
 /// The file is written through whichever IO engine `crate::virtual_file::io_engine::get()`
 /// reports, and `cancel` is forwarded to `storage.download`.
 ///
 /// If Ok() is returned, the download succeeded and the inode & data have been made durable.
 /// (Note that the directory entry for the inode is not made durable.)
 /// The file size in bytes is returned.
 ///
 /// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
 /// The unlinking has _not_ been made durable.
 async fn download_object<'a>(
    storage: &'a GenericRemoteStorage,
    src_path: &RemotePath,
    dst_path: &Utf8PathBuf,
    cancel: &CancellationToken,
 ) -> Result<u64, DownloadError> {
    // Each arm evaluates to a `Result`; errors are deliberately not propagated with `?` out of
    // this function here, so that the cleanup at the bottom runs for every failure path.
    let res = match crate::virtual_file::io_engine::get() {
        crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
        crate::virtual_file::io_engine::IoEngine::StdFs => {
            async {
                // Create the destination first, then start the remote download.
                let destination_file = tokio::fs::File::create(dst_path)
                    .await
                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
                    .map_err(DownloadError::Other)?;
                let download = storage.download(src_path, cancel).await?;
                // Stream the body into the file through a buffer of `super::BUFFER_SIZE` bytes.
                let mut buf_writer =
                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
                let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
                let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
                // Push out whatever is still sitting in the BufWriter before unwrapping it.
                buf_writer.flush().await?;
                let mut destination_file = buf_writer.into_inner();
                // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
                // A file will not be closed immediately when it goes out of scope if there are any IO operations
                // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
                // you should call flush before dropping it.
                //
                // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
                // we assume that the `destination_file` file is fully written, i.e. there are no pending .write(...).await operations.
                // But for additional safety let's check/wait for any pending operations.
                destination_file
                    .flush()
                    .await
                    .with_context(|| format!("flush source file at {dst_path}"))
                    .map_err(DownloadError::Other)?;
                // not using sync_data because it can lose file size update
                destination_file
                    .sync_all()
                    .await
                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
                    .map_err(DownloadError::Other)?;
                Ok(bytes_amount)
            }
            .await
        }
        // This arm is only compiled on Linux.
        #[cfg(target_os = "linux")]
        crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
            use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
            async {
                let destination_file = VirtualFile::create(dst_path)
                    .await
                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
                    .map_err(DownloadError::Other)?;
                let mut download = storage.download(src_path, cancel).await?;
                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
                // There's chunks_vectored() on the stream.
                let (bytes_amount, destination_file) = async {
                    // Writer stack: BufferedWriter (buffer of `super::BUFFER_SIZE` bytes) on top of
                    // a size-tracking writer on top of the destination file.
                    let size_tracking = size_tracking_writer::Writer::new(destination_file);
                    let mut buffered = owned_buffers_io::write::BufferedWriter::<
                        { super::BUFFER_SIZE },
                        _,
                    >::new(size_tracking);
                    while let Some(res) =
                        futures::StreamExt::next(&mut download.download_stream).await
                    {
                        let chunk = match res {
                            Ok(chunk) => chunk,
                            // `return` leaves only this inner async block; the error is then
                            // propagated by the `.await?` that follows the block.
                            Err(e) => return Err(e),
                        };
                        buffered
                            .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
                            .await?;
                    }
                    let size_tracking = buffered.flush_and_into_inner().await?;
                    // `into_inner()` yields the pair destructured above as
                    // `(bytes_amount, destination_file)`; presumably the count is the number of
                    // bytes written through the tracker -- TODO confirm against size_tracking_writer.
                    Ok(size_tracking.into_inner())
                }
                .await?;
                // not using sync_data because it can lose file size update
                destination_file
                    .sync_all()
                    .await
                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
                    .map_err(DownloadError::Other)?;
                Ok(bytes_amount)
            }
            .await
        }
    };
    // in case the download failed, clean up
    match res {
        Ok(bytes_amount) => Ok(bytes_amount),
        Err(e) => {
            // The inner `e` (the removal error) shadows the download error only inside this
            // `if let`; the original download error is what gets returned below.
            if let Err(e) = tokio::fs::remove_file(dst_path).await {
                // A missing file is tolerated (e.g. when creating the file itself failed);
                // any other removal error is handed to `on_fatal_io_error`.
                if e.kind() != std::io::ErrorKind::NotFound {
                    on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
                }
            }
            Err(e)
        }
    }
 }
 const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
 pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant {
    shard_identity: ShardIdentity,
    tenant_conf: std::sync::Mutex<TenantConfOpt>,
    // Internal state used by the Downloader.
    detail: std::sync::Mutex<SecondaryDetail>,
    // Public state indicating overall progress of downloads relative to the last heatmap seen
    pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
 }
 impl SecondaryTenant {
@@ -118,6 +122,8 @@ impl SecondaryTenant {
            tenant_conf: std::sync::Mutex::new(tenant_conf),
            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
            progress: std::sync::Mutex::default(),
        })
    }
@@ -247,9 +253,12 @@ impl SecondaryTenant {
 }
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
-/// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
+/// and heatmap uploads.  This is not a hot data path: it's used for:
-/// where we want to immediately upload/download for a particular tenant.  In normal operation
+/// - Live migrations, where we want to ensure a migration destination has the freshest possible
-/// uploads & downloads are autonomous and not driven by this interface.
+///   content before trying to cut over.
 /// - Tests, where we want to immediately upload/download for a particular tenant.
 ///
 /// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -15,6 +15,7 @@ use crate::{
    tenant::{
        config::SecondaryLocationConfig,
        debug_assert_current_span_has_tenant_and_timeline_id,
        ephemeral_file::is_ephemeral_file,
        remote_timeline_client::{
            index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
@@ -41,14 +42,16 @@ use crate::tenant::{
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use rand::Rng;
-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
-    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
+    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
    id::TimelineId,
 };
 use super::{
@@ -128,6 +131,7 @@ pub(super) struct SecondaryDetail {
    pub(super) config: SecondaryLocationConfig,
    last_download: Option<Instant>,
    last_etag: Option<Etag>,
    next_download: Option<Instant>,
    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
@@ -138,11 +142,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
    datetime.format("%d/%m/%Y %T")
 }
 /// Information returned from the heatmap download function when it detects that the heatmap
 /// has changed, i.e. the etag of the downloaded object differs from the previously seen one.
 struct HeatMapModified {
    /// Etag reported for the heatmap object by the download response.
    etag: Etag,
    /// Last-modified time reported for the heatmap object by the download response.
    last_modified: SystemTime,
    /// Raw body of the heatmap object; the caller deserializes it as JSON into `HeatMapTenant`.
    bytes: Vec<u8>,
 }
 /// Outcome of downloading the heatmap and comparing its etag with the previously seen one.
 enum HeatMapDownload {
    /// The heatmap's etag has changed: return the new etag, mtime and the body bytes
    Modified(HeatMapModified),
    /// The heatmap's etag is unchanged
    Unmodified,
 }
 impl SecondaryDetail {
    pub(super) fn new(config: SecondaryLocationConfig) -> Self {
        Self {
            config,
            last_download: None,
            last_etag: None,
            next_download: None,
            timelines: HashMap::new(),
        }
@@ -477,11 +496,31 @@ impl<'a> TenantDownloader<'a> {
        };
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
        // We will use the etag from last successful download to make the download conditional on changes
        let last_etag = self
            .secondary_state
            .detail
            .lock()
            .unwrap()
            .last_etag
            .clone();
        // Download the tenant's heatmap
-        let heatmap_bytes = tokio::select!(
+        let HeatMapModified {
-            bytes = self.download_heatmap() => {bytes?},
+            last_modified: heatmap_mtime,
            etag: heatmap_etag,
            bytes: heatmap_bytes,
        } = match tokio::select!(
            bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
            _ = self.secondary_state.cancel.cancelled() => return Ok(())
-        );
+        ) {
            HeatMapDownload::Unmodified => {
                tracing::info!("Heatmap unchanged since last successful download");
                return Ok(());
            }
            HeatMapDownload::Modified(m) => m,
        };
        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
@@ -496,11 +535,27 @@ impl<'a> TenantDownloader<'a> {
            .await
            .maybe_fatal_err(&context_msg)?;
-        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+        tracing::debug!(
            "Wrote local heatmap to {}, with {} timelines",
            heatmap_path,
            heatmap.timelines.len()
        );
        // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
        // principle that deletions should be done before writes wherever possible, and so that we can use this
        // phase to initialize our SecondaryProgress.
        {
            *self.secondary_state.progress.lock().unwrap() =
                self.prepare_timelines(&heatmap, heatmap_mtime).await?;
        }
        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!(
                    "Cancelled before downloading timeline {}",
                    timeline.timeline_id
                );
                return Ok(());
            }
@@ -515,30 +570,159 @@ impl<'a> TenantDownloader<'a> {
                .await?;
        }
        // Only update last_etag after a full successful download: this way an interrupted or failed
        // download will not cause the next download to be skipped, even if the heatmap's actual etag
        // is unchanged.
        self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
        Ok(())
    }
-    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
+    /// Do any fast local cleanup that comes before the much slower process of downloading
    /// layers from remote storage.  In the process, initialize the SecondaryProgress object
    /// that will later be updated incrementally as we download layers.
    ///
    /// Local state that the heatmap no longer references is removed, both on disk and from the
    /// in-memory `SecondaryDetail`: individual layers of timelines that are still listed, and
    /// whole timelines that have disappeared from the heatmap.
    ///
    /// The returned progress takes its totals from the heatmap, and counts as already
    /// "downloaded" those on-disk layers that the heatmap still references.
    ///
    /// Returns an error if removing a local file or directory fails for any reason other than
    /// the path not existing.
    async fn prepare_timelines(
        &self,
        heatmap: &HeatMapTenant,
        heatmap_mtime: SystemTime,
    ) -> Result<SecondaryProgress, UpdateError> {
        let heatmap_stats = heatmap.get_stats();
        // Totals come from the heatmap; the "downloaded" counters start at zero and are populated
        // below from the layers that are already present in our local layer state.
        let mut progress = SecondaryProgress {
            layers_total: heatmap_stats.layers,
            bytes_total: heatmap_stats.bytes,
            heatmap_mtime: Some(heatmap_mtime),
            layers_downloaded: 0,
            bytes_downloaded: 0,
        };
        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock:
        // the deletions await on filesystem I/O, and the std mutex guard must not live across an await.
        let mut delete_layers = Vec::new();
        let mut delete_timelines = Vec::new();
        {
            let detail = self.secondary_state.detail.lock().unwrap();
            for (timeline_id, timeline_state) in &detail.timelines {
                let Some(heatmap_timeline) = heatmap
                    .timelines
                    .iter()
                    .find(|t| t.timeline_id == *timeline_id)
                else {
                    // This timeline is no longer referenced in the heatmap: delete it locally
                    delete_timelines.push(*timeline_id);
                    continue;
                };
                let layers_in_heatmap = heatmap_timeline
                    .layers
                    .iter()
                    .map(|l| &l.name)
                    .collect::<HashSet<_>>();
                // Single pass over the local layers: those still in the heatmap count towards the
                // initial progress, the rest are queued for removal.
                for (layer_name, layer) in &timeline_state.on_disk_layers {
                    if layers_in_heatmap.contains(layer_name) {
                        progress.layers_downloaded += 1;
                        progress.bytes_downloaded += layer.metadata.file_size();
                    } else {
                        delete_layers.push((*timeline_id, layer_name.clone()));
                    }
                }
            }
        }
        // Execute accumulated deletions
        for (timeline_id, layer_name) in delete_layers {
            let timeline_path = self
                .conf
                .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
            let local_path = timeline_path.join(layer_name.to_string());
            tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
            tokio::fs::remove_file(&local_path)
                .await
                .or_else(fs_ext::ignore_not_found)
                .maybe_fatal_err("Removing secondary layer")?;
            // Update in-memory housekeeping to reflect the absence of the deleted layer
            let mut detail = self.secondary_state.detail.lock().unwrap();
            let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
                continue;
            };
            timeline_state.on_disk_layers.remove(&layer_name);
        }
        for timeline_id in delete_timelines {
            let timeline_path = self
                .conf
                .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
            tracing::info!(timeline_id=%timeline_id,
                "Timeline no longer in heatmap, removing from secondary location"
            );
            tokio::fs::remove_dir_all(&timeline_path)
                .await
                .or_else(fs_ext::ignore_not_found)
                .maybe_fatal_err("Removing secondary timeline")?;
            // Update in-memory housekeeping as well, mirroring the layer loop above: otherwise the
            // stale entry would keep describing layers that are no longer on disk. The guard is a
            // temporary of this statement, so it is released before the next await.
            self.secondary_state
                .detail
                .lock()
                .unwrap()
                .timelines
                .remove(&timeline_id);
        }
        Ok(progress)
    }
    /// Returns [`HeatMapDownload::Modified`] with the downloaded bytes if the etag differs from
    /// `prev_etag`, or [`HeatMapDownload::Unmodified`] if the object still matches `prev_etag`.
    async fn download_heatmap(
        &self,
        prev_etag: Option<&Etag>,
    ) -> Result<HeatMapDownload, UpdateError> {
        debug_assert_current_span_has_tenant_id();
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
-        // TODO: make download conditional on ETag having changed since last download
+        // TODO: pull up etag check into the request, to do a conditional GET rather than
        // issuing a GET and then maybe ignoring the response body
        // (https://github.com/neondatabase/neon/issues/6199)
        tracing::debug!("Downloading heatmap for secondary tenant",);
        let heatmap_path = remote_heatmap_path(tenant_shard_id);
        let cancel = &self.secondary_state.cancel;
-        let heatmap_bytes = backoff::retry(
+        backoff::retry(
            || async {
                let download = self
                    .remote_storage
                    .download(&heatmap_path, cancel)
                    .await
                    .map_err(UpdateError::from)?;
-                let mut heatmap_bytes = Vec::new();
+
-                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+                if Some(&download.etag) == prev_etag {
-                let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
+                    Ok(HeatMapDownload::Unmodified)
-                Ok(heatmap_bytes)
+                } else {
                    let mut heatmap_bytes = Vec::new();
                    let mut body = tokio_util::io::StreamReader::new(download.download_stream);
                    let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
                    SECONDARY_MODE.download_heatmap.inc();
                    Ok(HeatMapDownload::Modified(HeatMapModified {
                        etag: download.etag,
                        last_modified: download.last_modified,
                        bytes: heatmap_bytes,
                    }))
                }
            },
            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
            FAILED_DOWNLOAD_WARN_THRESHOLD,
@@ -548,11 +732,7 @@ impl<'a> TenantDownloader<'a> {
        )
        .await
        .ok_or_else(|| UpdateError::Cancelled)
-        .and_then(|x| x)?;
+        .and_then(|x| x)
        SECONDARY_MODE.download_heatmap.inc();
        Ok(heatmap_bytes)
    }
    async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
@@ -593,31 +773,13 @@ impl<'a> TenantDownloader<'a> {
            }
        };
-        let layers_in_heatmap = timeline
+        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
            .layers
            .iter()
            .map(|l| &l.name)
            .collect::<HashSet<_>>();
        let layers_on_disk = timeline_state
            .on_disk_layers
            .iter()
            .map(|l| l.0)
            .collect::<HashSet<_>>();
        // Remove on-disk layers that are no longer present in heatmap
        for layer in layers_on_disk.difference(&layers_in_heatmap) {
            let local_path = timeline_path.join(layer.to_string());
            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
            tokio::fs::remove_file(&local_path)
                .await
                .or_else(fs_ext::ignore_not_found)
                .maybe_fatal_err("Removing secondary layer")?;
        }
        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!("Cancelled -- dropping out of layer loop");
                return Ok(());
            }
@@ -662,6 +824,12 @@ impl<'a> TenantDownloader<'a> {
                }
            }
            // Failpoint for simulating slow remote storage
            failpoint_support::sleep_millis_async!(
                "secondary-layer-download-sleep",
                &self.secondary_state.cancel
            );
            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
            let downloaded_bytes = match download_layer_file(
                self.conf,
@@ -701,6 +869,11 @@ impl<'a> TenantDownloader<'a> {
                tokio::fs::remove_file(&local_path)
                    .await
                    .or_else(fs_ext::ignore_not_found)?;
            } else {
                tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
                let mut progress = self.secondary_state.progress.lock().unwrap();
                progress.bytes_downloaded += downloaded_bytes;
                progress.layers_downloaded += 1;
            }
            SECONDARY_MODE.download_layer.inc();
@@ -789,7 +962,10 @@ async fn init_timeline_state(
            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
            continue;
-        } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
+        } else if crate::is_temporary(&file_path)
            || is_temp_download_file(&file_path)
            || is_ephemeral_file(file_name)
        {
            // Temporary files are frequently left behind from restarting during downloads
            tracing::info!("Cleaning up temporary file {file_path}");
            if let Err(e) = tokio::fs::remove_file(&file_path)
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -62,3 +62,25 @@ impl HeatMapTimeline {
        }
    }
 }
 /// Aggregate totals over every layer listed in a heatmap, as produced by
 /// `HeatMapTenant::get_stats`.
 pub(crate) struct HeatMapStats {
    /// Sum of `metadata.file_size` over all layers of all timelines.
    pub(crate) bytes: u64,
    /// Number of layers across all timelines.
    pub(crate) layers: usize,
 }
 impl HeatMapTenant {
    /// Walk every layer of every timeline in this heatmap and total them up.
    ///
    /// The result carries the number of layers seen and the sum of their
    /// `metadata.file_size` values; an empty heatmap yields zero for both.
    pub(crate) fn get_stats(&self) -> HeatMapStats {
        let zero = HeatMapStats {
            bytes: 0,
            layers: 0,
        };
        self.timelines
            .iter()
            .flat_map(|timeline| timeline.layers.iter())
            .fold(zero, |mut totals, layer| {
                totals.layers += 1;
                totals.bytes += layer.metadata.file_size;
                totals
            })
    }
 }
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -9,6 +9,7 @@ use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
        config::AttachmentMode,
        mgr::GetTenantError,
        mgr::TenantManager,
        remote_timeline_client::remote_heatmap_path,
        span::debug_assert_current_span_has_tenant_id,
@@ -292,8 +293,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            "Starting heatmap write on command");
        let tenant = self
            .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id, true)
+            .get_attached_tenant_shard(*tenant_shard_id)
            .map_err(|e| anyhow::anyhow!(e))?;
        if !tenant.is_active() {
            return Err(GetTenantError::NotActive(*tenant_shard_id).into());
        }
        Ok(UploadPending {
            // Ignore our state for last digest: this forces an upload even if nothing has changed
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -300,6 +300,7 @@ where
        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
            tracing::info!("Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,3 @@`

							`UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';`
							`UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';`